diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml
deleted file mode 100644
index bf73fede16c7f..0000000000000
--- a/.github/actions/rust-toolchain-setup/action.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-# yaml-language-server: $schema=https://json.schemastore.org/github-action.json
-
-name: 'Rust toolchain setup'
-description: 'Common setup steps for GitHub workflows for Rust projects'
-
-runs:
- using: composite
- steps:
- - uses: dtolnay/rust-toolchain@1.71.0
- with:
- components: clippy, rustfmt
- - uses: extractions/setup-just@v1
- with:
- just-version: '1.15.0' # optional semver specification, otherwise latest
-
- ###
- ### Linux setup
- ###
- - name: rustup
- # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds.
- if: ${{ (runner.os == 'Linux') }}
- run: |
- rustup set profile minimal
- rustup install
- shell: bash
- # - name: Cargo login
- # if: ${{ (runner.os == 'Linux') }}
- # run: just cargo-login-ci
- # shell: bash
-
- ###
- ### Windows setup
- ###
- - name: rustup
- # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds.
- if: ${{ (runner.os == 'Windows') }}
- run: |
- rustup set profile minimal
- rustup install
- shell: pwsh
- # - name: Cargo login
- # if: ${{ (runner.os == 'Windows') }}
- # run: just cargo-login-ci-windows
- # shell: pwsh
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index d3ecf44fe5733..e4d1b91bab736 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -37,7 +37,7 @@ jobs:
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
- uses: github/codeql-action/init@v2
+ uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -47,11 +47,19 @@ jobs:
# Details on CodeQL's query packs refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
queries: security-extended,security-and-quality
+ # Setup Java to use a version that is not too old for the project
+ - if: ${{ matrix.language == 'java' }}
+ name: Setup Java 11
+ uses: actions/setup-java@v4
+ with:
+ java-version: '11'
+ distribution: 'microsoft'
+
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- if: ${{ matrix.language != 'cpp' }}
name: Autobuild
- uses: github/codeql-action/autobuild@v2
+ uses: github/codeql-action/autobuild@v3
- name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@v2
+ uses: github/codeql-action/analyze@v3
diff --git a/.github/workflows/generated_fake_win_gpu_ci.yml b/.github/workflows/generated_fake_win_gpu_ci.yml
deleted file mode 100644
index 4bc324cba0307..0000000000000
--- a/.github/workflows/generated_fake_win_gpu_ci.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Don't edit this file manully.
-# Run generate-skip-doc-change.py to generate it.
-
-name: Windows GPU CI Pipeline
-on:
- pull_request:
- paths:
- - docs/**
- - README.md
- - CONTRIBUTING.md
- - BUILD.md
-
-jobs:
- job1:
- name: cuda build_x64_RelWithDebInfo
- runs-on: ubuntu-latest
- steps:
- - run: 'echo "No build required, only documentation changed"'
-
- job2:
- name: dml build_x64_RelWithDebInfo
- runs-on: ubuntu-latest
- steps:
- - run: 'echo "No build required, only documentation changed"'
-
- job3:
- name: training build_x64_RelWithDebInfo
- runs-on: ubuntu-latest
- steps:
- - run: 'echo "No build required, only documentation changed"'
-
- job4:
- name: kernelDocumentation build_x64_RelWithDebInfo
- runs-on: ubuntu-latest
- steps:
- - run: 'echo "No build required, only documentation changed"'
diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml
index 03ea773a25130..73df5e31fda63 100644
--- a/.github/workflows/gradle-wrapper-validation.yml
+++ b/.github/workflows/gradle-wrapper-validation.yml
@@ -11,4 +11,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- - uses: gradle/wrapper-validation-action@v1
+ - uses: gradle/wrapper-validation-action@v3
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 4a4e286071ff5..a196226a4b836 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -3,11 +3,14 @@ on:
issues:
types: [opened, edited]
+permissions:
+ issues: write
+
jobs:
triage:
runs-on: ubuntu-latest
steps:
- - uses: github/issue-labeler@v3.2
+ - uses: github/issue-labeler@v3.4
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
configuration-path: .github/labeler.yml
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 432c789e943b5..34911cfc7972e 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -36,7 +36,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Setup Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
# Version range or exact version of Python to use, using SemVer's version range syntax. Reads from .python-version if unset.
python-version: "3.10"
@@ -65,7 +65,7 @@ jobs:
- name: Upload SARIF file
if: always()
continue-on-error: true
- uses: github/codeql-action/upload-sarif@v2
+ uses: github/codeql-action/upload-sarif@v3
with:
# Path to SARIF file relative to the root of the repository
sarif_file: lintrunner.sarif
@@ -93,7 +93,10 @@ jobs:
github_token: ${{ secrets.github_token }}
reporter: github-pr-check
level: warning
- flags: --linelength=120 --exclude=java/src/main/native/*.c
+ flags: --linelength=120
+ --exclude=java/src/main/native/*.c
+ --exclude=onnxruntime/core/mlas/inc/*
+ --exclude=onnxruntime/core/mlas/lib/*
filter: "-runtime/references"
lint-js:
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
deleted file mode 100644
index 7b314d845d9b4..0000000000000
--- a/.github/workflows/linux.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: Linux_CI
-on:
- push:
- branches:
- - main
- - rel-*
- pull_request:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
- cancel-in-progress: true
-
-jobs:
- Onnxruntime-TVM:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: true
- - uses: actions/setup-python@v4
- with:
- python-version: '3.8.x'
- architecture: 'x64'
- - name: 'Setup TVM EP requirements'
- run: |
- set -e -x
- sudo apt-get update
- sudo apt-get install -y libtinfo-dev zlib1g-dev build-essential libedit-dev libxml2-dev nasm
- python3 -m pip install -r ${{ github.workspace }}/tools/ci_build/github/linux/tvm/requirements.txt
- - name: 'Build and Test'
- run: |
- python3 ${{ github.workspace }}/tools/ci_build/build.py --build_dir build --config Release --skip_submodule_sync --parallel --enable_pybind --disable_contrib_ops --disable_ml_ops --skip_onnx_tests --use_tvm --use_tvm_hash --ctest_path ""
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
new file mode 100644
index 0000000000000..8aaec8adef979
--- /dev/null
+++ b/.github/workflows/mac.yml
@@ -0,0 +1,109 @@
+name: Mac_CI
+
+on:
+ push:
+ branches:
+ - main
+ - rel-*
+ pull_request:
+ branches:
+ - main
+ - rel-*
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+env:
+ python_version: 3.11
+ xcode_version: 15.2
+
+jobs:
+ ARM64:
+ runs-on: macos-14
+
+ timeout-minutes: 60
+
+ steps:
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.python_version }}
+
+ - name: Verify ARM64 machine
+ shell: python
+ run: |
+ import platform
+ assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine."
+
+ - name: Use Xcode ${{ env.xcode_version }}
+ shell: bash
+ run: |
+ XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer"
+ sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}"
+
+ - uses: actions/checkout@v4
+
+ - name: Build and test
+ shell: bash
+ run: |
+ python ./tools/ci_build/build.py \
+ --build_dir ./build \
+ --update \
+ --build --parallel \
+ --test \
+ --build_shared_lib \
+ --build_objc \
+ --use_xnnpack \
+ --use_binskim_compliant_compile_flags
+
+ # TODO add --use_coreml once unit test failures are addressed
+
+ Objective-C-StaticAnalysis:
+ runs-on: macos-14
+
+ timeout-minutes: 30
+
+ steps:
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.python_version }}
+
+ - name: Use Xcode ${{ env.xcode_version }}
+ shell: bash
+ run: |
+ XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer"
+ sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}"
+
+ - uses: actions/checkout@v4
+
+ - name: Generate compile_commands.json and ONNX protobuf files
+ shell: bash
+ run: |
+ python ./tools/ci_build/build.py \
+ --build_dir ./build \
+ --cmake_generator "Unix Makefiles" \
+ --config Debug \
+ --build_shared_lib \
+ --use_coreml \
+ --build_objc \
+ --enable_training_apis \
+ --cmake_extra_defines CMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ --use_binskim_compliant_compile_flags \
+ --update \
+ --build --parallel \
+ --target onnx_proto
+
+ - name: Analyze Objective-C/C++ source code
+ shell: bash
+ run: |
+ CLANG_TIDY_CHECKS="-*,clang-analyzer-*"
+
+ "$(brew --prefix llvm@15)/bin/clang-tidy" \
+ -p=./build/Debug \
+ --checks="${CLANG_TIDY_CHECKS}" \
+ --warnings-as-errors="${CLANG_TIDY_CHECKS}" \
+ --header-filter="objectivec/include|objectivec|onnxruntime/core" \
+ ./objectivec/*.mm \
+ ./onnxruntime/core/platform/apple/logging/apple_log_sink.mm \
+ ./onnxruntime/core/providers/coreml/model/*.mm
diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml
index 0a3e9ed2594c1..b097cdbd9a55c 100644
--- a/.github/workflows/publish-c-apidocs.yml
+++ b/.github/workflows/publish-c-apidocs.yml
@@ -45,7 +45,7 @@ jobs:
rm -rf site/docs/api/c
mv build/doxygen/html _site/docs/api/c
- name: Upload new site
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: onnxruntime-c-apidocs
path: _site
diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml
index 9b9ca924bd008..5bc21595bf882 100644
--- a/.github/workflows/publish-csharp-apidocs.yml
+++ b/.github/workflows/publish-csharp-apidocs.yml
@@ -26,7 +26,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Setup .NET
- uses: actions/setup-dotnet@v3
+ uses: actions/setup-dotnet@v4
with:
dotnet-version: 6.0.x
- name: Restore dependencies
@@ -37,7 +37,7 @@ jobs:
wget https://github.com/dotnet/docfx/releases/download/v${DOCFXVERSION}/docfx-linux-x64-v${DOCFXVERSION}.zip -O build/docfx/docfx.zip
unzip build/docfx/docfx.zip -d build/docfx
- name: Install NuGet
- uses: nuget/setup-nuget@v1
+ uses: nuget/setup-nuget@v2
- name: Build Documentation
run: |
build/docfx/docfx metadata csharp/ApiDocs/docfx.json
@@ -51,7 +51,7 @@ jobs:
rm -rf _site/docs/api/csharp
mv csharp/ApiDocs/csharp _site/docs/api/csharp
- name: Upload docs artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: onnxruntime-csharp-apidocs
path: _site
diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml
index 9ea9bda7e7c53..3e553049a186e 100644
--- a/.github/workflows/publish-java-apidocs.yml
+++ b/.github/workflows/publish-java-apidocs.yml
@@ -25,12 +25,12 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up JDK 11
- uses: actions/setup-java@v3
+ uses: actions/setup-java@v4
with:
java-version: '11'
distribution: 'adopt'
- name: Build with Gradle
- uses: gradle/gradle-build-action@v2
+ uses: gradle/gradle-build-action@v3
with:
build-root-directory: java
gradle-executable: java/gradlew
@@ -43,7 +43,7 @@ jobs:
mkdir -p _site/docs/api
mv java/build/docs/javadoc _site/docs/api/java
- name: Upload new site
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: onnxruntime-java-apidocs
path: _site
diff --git a/.github/workflows/publish-js-apidocs.yml b/.github/workflows/publish-js-apidocs.yml
index ba8bfd718abfa..db021106a6554 100644
--- a/.github/workflows/publish-js-apidocs.yml
+++ b/.github/workflows/publish-js-apidocs.yml
@@ -25,7 +25,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Setup Node.js
- uses: actions/setup-node@v3
+ uses: actions/setup-node@v4
with:
node-version: 18
- name: Generate JS docs
@@ -43,7 +43,7 @@ jobs:
mkdir -p _site/docs/api
mv js/common/docs _site/docs/api/js
- name: Upload docs artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: onnxruntime-node-apidocs
path: _site
diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml
index 1b327eebfa8a8..ebacd38f1f882 100644
--- a/.github/workflows/publish-objectivec-apidocs.yml
+++ b/.github/workflows/publish-objectivec-apidocs.yml
@@ -21,7 +21,7 @@ permissions:
jobs:
build:
name: Generate Objective-C API docs
- runs-on: macos-13
+ runs-on: macos-latest
steps:
- uses: actions/checkout@v4
@@ -44,7 +44,7 @@ jobs:
shell: bash
- name: Upload new site
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: onnxruntime-objectivec-apidocs
path: ./_site
diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml
index ab9d4781afb83..e98d22450c5b0 100644
--- a/.github/workflows/publish-python-apidocs.yml
+++ b/.github/workflows/publish-python-apidocs.yml
@@ -49,7 +49,7 @@ jobs:
mkdir -p _site/docs/api/
mv build/docs/html _site/docs/api/python
- name: Upload docs artifact
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: onnxruntime-python-apidocs
path: _site
diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
deleted file mode 100644
index 725c40c2ded53..0000000000000
--- a/.github/workflows/rust-ci.yml
+++ /dev/null
@@ -1,132 +0,0 @@
-name: Rust
-
-on: [pull_request]
-
-env:
- CARGO_TERM_COLOR: always
- RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug
- RUST_BACKTRACE: 1
- MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml
-
-jobs:
- fmt:
- name: Rustfmt
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - name: fmt
- run: cargo fmt --all -- --check
-
- download:
- name: Download prebuilt ONNX Runtime archive from build.rs
- runs-on: ubuntu-latest
- env:
- ORT_RUST_STRATEGY: download
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - run: rustup target install x86_64-unknown-linux-gnu
- - run: rustup target install x86_64-apple-darwin
- - run: rustup target install i686-pc-windows-msvc
- - run: rustup target install x86_64-pc-windows-msvc
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu)
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu)
- run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-apple-darwin)
- run: cargo build --target x86_64-apple-darwin --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-apple-darwin)
- run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (CPU, i686-pc-windows-msvc)
- run: cargo build --target i686-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc)
- run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc)
- run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc)
- run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip
- # ******************************************************************
- - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu)
- env:
- ORT_USE_CUDA: "yes"
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu)
- run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc)
- env:
- ORT_USE_CUDA: "yes"
- run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc)
- run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip
-
- test:
- name: Test Suite
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- target:
- [
- x86_64-unknown-linux-gnu,
- x86_64-apple-darwin,
- x86_64-pc-windows-msvc,
- i686-pc-windows-msvc,
- ]
- include:
- - target: x86_64-unknown-linux-gnu
- os: ubuntu-latest
- - target: x86_64-apple-darwin
- os: macos-latest
- - target: x86_64-pc-windows-msvc
- os: windows-latest
- - target: i686-pc-windows-msvc
- os: windows-latest
- env:
- CARGO_BUILD_TARGET: ${{ matrix.target }}
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: rustup target install ${{ matrix.target }}
- - name: Install additional packages (macOS)
- if: contains(matrix.target, 'x86_64-apple-darwin')
- run: brew install libomp
- - name: Build (cargo build)
- run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Build tests (cargo test)
- run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Build onnxruntime with 'model-fetching' feature
- run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching
- - name: Test onnxruntime-sys
- run: cargo build --package onnxruntime-sys -- --test-threads=1 --nocapture
- - name: Test onnxruntime
- run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture
-
- clippy:
- name: Clippy
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings
-
- package-sys:
- name: Package onnxruntime-sys
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: cargo package --allow-dirty --package onnxruntime-sys
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 3ef5076583001..181f3fb17d332 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
issues: write
pull-requests: write
steps:
- - uses: actions/stale@v8.0.0
+ - uses: actions/stale@v8
with:
# Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale
exempt-issue-labels: contributions welcome, feature request, regression
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 3a780f87d2300..b77e48942ec44 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -21,12 +21,12 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: false
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@v5
with:
python-version: '3.11.x'
architecture: 'x64'
- - uses: actions/setup-node@v3
+ - uses: actions/setup-node@v4
with:
node-version: 18
@@ -41,38 +41,4 @@ jobs:
# The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter.
- name: Build code
- run: python tools\ci_build\build.py --windows_sdk_version 10.0.22621.0 --enable_training --build_java --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_pybind --use_cuda --cuda_home=${{ github.workspace }}\cuda_sdk\v12.2 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
-
- Onnxruntime-TVM:
- runs-on: windows-latest
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: true
- - uses: conda-incubator/setup-miniconda@v2
- with:
- activate-environment: "ort_build"
- python-version: 3.8
- - name: 'Install LLVM-Dev'
- shell: pwsh
- run: |
- conda install llvmdev=12.0.0
- conda info
- conda list
- - name: 'Add LLVM-Dev binaries to the PATH'
- run: |
- echo "C:/Miniconda/Library/bin" >> $GITHUB_PATH
- - name: 'Setup TVM EP Python requirements'
- run: |
- python3 -m pip install -r ${{ github.workspace }}/tools/ci_build/github/linux/tvm/requirements.txt
- - name: 'rm gtest in conda'
- shell: pwsh
- run: |
- Remove-Item 'C:\Miniconda\Library\lib\cmake\gtest' -Recurse -Force
- Remove-Item 'C:\Miniconda\Library\lib\gmock.lib' -Force
- Remove-Item 'C:\Miniconda\Library\lib\gmock_main.lib' -Force
- Remove-Item 'C:\Miniconda\Library\lib\gtest.lib' -Force
- Remove-Item 'C:\Miniconda\Library\lib\gtest_main.lib' -Force
- - name: 'Build and Test'
- run: |
- python3 ${{ github.workspace }}/tools/ci_build/build.py --build_dir build --config Release --skip_submodule_sync --parallel --enable_pybind --disable_contrib_ops --disable_ml_ops --skip_onnx_tests --use_tvm
+ run: python tools\ci_build\build.py --windows_sdk_version 10.0.22621.0 --enable_training --build_java --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_pybind --use_cuda --cuda_home=${{ github.workspace }}\cuda_sdk\v12.2 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
diff --git a/.gitmodules b/.gitmodules
index 7bb49e98bfec1..29ca8821f8eb8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
[submodule "cmake/external/emsdk"]
path = cmake/external/emsdk
url = https://github.com/emscripten-core/emsdk.git
- branch = 3.1.44
+ branch = 3.1.59
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 4e5d077b08ff4..ad782079bf76e 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -130,8 +130,10 @@ exclude_patterns = [
'js/**',
'onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/**', # Contains data chunks
'onnxruntime/core/flatbuffers/schema/*.fbs.h', # Generated code
+ 'onnxruntime/test/flatbuffers/*.fbs.h', # Generated code
'onnxruntime/core/graph/contrib_ops/quantization_defs.cc',
'onnxruntime/core/mlas/**', # Contains assembly code
+ 'onnxruntime/core/mickey/cutlass_ext/**', # CUTLASS lib recommends NO automatic code formatting
'winml/lib/Api.Image/shaders/**', # Contains data chunks
]
command = [
diff --git a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
index b9de1b79e1d51..fd3b7266d30f7 100644
--- a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
+++ b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
@@ -29,6 +29,8 @@ extends:
git:
submodules: false
globalSdl: # https://aka.ms/obpipelines/sdl
+ asyncSdl:
+ enabled: false
tsa:
enabled: true
prefast:
@@ -53,10 +55,6 @@ extends:
BuildArch: x86
PythonPackageName: pythonx86
- - template: .pipelines/windowsai-steps.yml@self
- parameters:
- BuildArch: arm
-
- template: .pipelines/windowsai-steps.yml@self
parameters:
BuildArch: arm64
@@ -72,11 +70,6 @@ extends:
PythonPackageName: pythonx86
Runtime: static
- - template: .pipelines/windowsai-steps.yml@self
- parameters:
- BuildArch: arm
- Runtime: static
-
- template: .pipelines/windowsai-steps.yml@self
parameters:
BuildArch: arm64
@@ -94,11 +87,9 @@ extends:
dependsOn:
- Windows_Packaging_x64_dynamic
- Windows_Packaging_x86_dynamic
- - Windows_Packaging_arm_dynamic
- Windows_Packaging_arm64_dynamic
- Windows_Packaging_x64_static
- Windows_Packaging_x86_static
- - Windows_Packaging_arm_static
- Windows_Packaging_arm64_static
condition: succeeded()
steps:
@@ -120,12 +111,6 @@ extends:
artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_dynamic'
targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64'
- - task: DownloadPipelineArtifact@0
- displayName: 'Download Pipeline Artifact - NuGet DirectML arm'
- inputs:
- artifactName: 'drop_Windows_Build_Windows_Packaging_arm_dynamic'
- targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm'
-
- task: DownloadPipelineArtifact@0
displayName: 'Download Pipeline Artifact - NuGet DirectML x64 StaticRuntime'
inputs:
@@ -144,12 +129,6 @@ extends:
artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_static'
targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64-static-runtime'
- - task: DownloadPipelineArtifact@0
- displayName: 'Download Pipeline Artifact - NuGet DirectML arm StaticRuntime'
- inputs:
- artifactName: 'drop_Windows_Build_Windows_Packaging_arm_static'
- targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm-static-runtime'
-
- task: PowerShell@2
displayName: 'Bundle NuGet and other binaries'
inputs:
@@ -194,17 +173,7 @@ extends:
$arm64_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm64_static_runtime_nuget_package))
[System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_static_runtime_nuget_package, $arm64_static_runtime_nupkg_unzipped_directory)
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse)
- $arm_nuget_package = $nupkgs[0].FullName
- $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory)
-
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm-static-runtime -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse)
- $arm_static_runtime_nuget_package = $nupkgs[0].FullName
- $arm_static_runtime_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_static_runtime_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_static_runtime_nuget_package, $arm_static_runtime_nupkg_unzipped_directory)
+
$x64_static_runtime_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native')
$x64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native', 'static')
@@ -216,10 +185,7 @@ extends:
$arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_static_runtime_path_old = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native', 'static')
- $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_static_runtime_path_old = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native', 'static')
+
$uap_build_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'build', 'native')
$uap_build_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'build', 'uap10.0')
@@ -228,8 +194,6 @@ extends:
New-Item -Path $x86_static_runtime_path_new -ItemType Directory
New-Item -Path $arm64_runtime_path_new -ItemType Directory
New-Item -Path $arm64_static_runtime_path_new -ItemType Directory
- New-Item -Path $arm_runtime_path_new -ItemType Directory
- New-Item -Path $arm_static_runtime_path_new -ItemType Directory
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.dll')) $x86_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.lib')) $x86_runtime_path_new
@@ -241,11 +205,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm64_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm64_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.dll')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.lib')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm_runtime_path_new
-
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.dll'))
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.lib'))
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
@@ -261,11 +220,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.lib'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.dll'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.lib'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.lib'))
-
Copy-Item -Recurse $uap_build_path_old $uap_build_path_new
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged')
@@ -304,22 +258,13 @@ extends:
$arm64_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm64_nuget_package))
[System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_nuget_package, $arm64_nupkg_unzipped_directory)
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.snupkg -Recurse)
- $arm_nuget_package = $nupkgs[0].FullName
- $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory)
-
$x86_runtime_path_old = [System.IO.Path]::Combine($x86_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native')
$x86_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native')
$arm64_runtime_path_old = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
- $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
-
+
New-Item -Path $x86_runtime_path_new -ItemType Directory
New-Item -Path $arm64_runtime_path_new -ItemType Directory
- New-Item -Path $arm_runtime_path_new -ItemType Directory
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.pdb')) $x86_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $x86_runtime_path_new
@@ -327,9 +272,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'onnxruntime.pdb')) $arm64_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm64_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.pdb')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm_runtime_path_new
-
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged')
if (!(Test-Path $merged_nuget_path)) {
New-Item -Path $merged_nuget_path -ItemType Directory
diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config
index 2ac650b0e6dc9..9066e13ee1c8d 100644
--- a/.pipelines/nuget_config/x64/packages.config
+++ b/.pipelines/nuget_config/x64/packages.config
@@ -1,6 +1,6 @@
-
+
diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config
index f80f96194a230..a8e5b35b28b36 100644
--- a/.pipelines/nuget_config/x86/packages.config
+++ b/.pipelines/nuget_config/x86/packages.config
@@ -1,6 +1,6 @@
-
+
diff --git a/.pipelines/windowsai-steps.yml b/.pipelines/windowsai-steps.yml
index 292ce60c6b6cf..855573de753b0 100644
--- a/.pipelines/windowsai-steps.yml
+++ b/.pipelines/windowsai-steps.yml
@@ -80,11 +80,11 @@ jobs:
# must call vsdevcmd first to add cmake to PATH
- script: |
- curl -O -L https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-windows-x86_64.zip
- 7z x cmake-3.26.3-windows-x86_64.zip
+ curl -O -L https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-windows-x86_64.zip
+ 7z x cmake-3.28.3-windows-x86_64.zip
set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools
set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools
- $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\ctest.exe
+ $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos --windows_sdk_version "10.0.22621.0" $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" --cmake_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\ctest.exe
workingDirectory: '$(Build.BinariesDirectory)'
displayName: 'Generate cmake config'
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2f2adc78f6de9..98d23090fd474 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,7 +11,7 @@
// Auto sort imports
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
- "source.organizeImports": true
+ "source.organizeImports": "explicit"
},
"editor.defaultFormatter": "ms-python.black-formatter"
},
@@ -21,5 +21,8 @@
"cpplint.filters": [
"-build/include_subdir",
"-runtime/references"
- ]
+ ],
+ "files.associations": {
+ "span": "cpp"
+ }
}
diff --git a/CITATION.cff b/CITATION.cff
index 82bcac5a7b750..10b7290022aef 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -3,8 +3,7 @@ title: ONNX Runtime
message: "Please use this information to cite ONNX Runtime in
research or other publications."
authors:
- - affiliation: Microsoft Corporation
- given-names: ONNX Runtime developers
+ - name: ONNX Runtime developers
date-released: 2018-11-29
url: "https://onnxruntime.ai"
repository-code: "https://github.com/microsoft/onnxruntime"
diff --git a/README.md b/README.md
index 33bce867e3bde..24c3e191c115b 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
* **General Information**: [onnxruntime.ai](https://onnxruntime.ai)
-* **Usage documention and tutorials**: [onnxruntime.ai/docs](https://onnxruntime.ai/docs)
+* **Usage documentation and tutorials**: [onnxruntime.ai/docs](https://onnxruntime.ai/docs)
* **YouTube video tutorials**: [youtube.com/@ONNXRuntime](https://www.youtube.com/@ONNXRuntime)
diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt
index 700206180decd..8ec770da22159 100644
--- a/ThirdPartyNotices.txt
+++ b/ThirdPartyNotices.txt
@@ -1829,7 +1829,7 @@ Zbigniew Skowron
_____
-HalidelR
+HalideIR
Copyright (c) 2016 HalideIR contributors
Copyright (c) 2012-2014 MIT CSAIL, Google Inc., and other contributors
@@ -6299,3 +6299,210 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+
+_____
+
+neural-speed
+
+https://github.com/intel/neural-speed
+
+ Apache License
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ ============================================================================
+
+ Copyright 2016-2019 Intel Corporation
+ Copyright 2018 YANDEX LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ This distribution includes third party software ("third party programs").
+ This third party software, even if included with the distribution of
+ the Intel software, may be governed by separate license terms, including
+ without limitation, third party license terms, other Intel software license
+ terms, and open source software license terms. These separate license terms
+ govern your use of the third party programs as set forth in the
+ "THIRD-PARTY-PROGRAMS" file.
diff --git a/VERSION_NUMBER b/VERSION_NUMBER
index 092afa15df4df..815d5ca06d530 100644
--- a/VERSION_NUMBER
+++ b/VERSION_NUMBER
@@ -1 +1 @@
-1.17.0
+1.19.0
diff --git a/build_arm64x.bat b/build_arm64x.bat
index fbcdd373086a9..1ed268ae94a43 100644
--- a/build_arm64x.bat
+++ b/build_arm64x.bat
@@ -5,7 +5,6 @@
setlocal
set PATH=C:\Program Files\Git\usr\bin;%PATH%
-set LINK_REPRO_NAME=/mylink.rsp
rem Requires a Python install to be available in your PATH
python "%~dp0\tools\ci_build\build.py" --arm64 --buildasx --build_dir "%~dp0\build\arm64-x" %*
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index e8dbc9cf9eff6..cf245e63a3a5d 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -469,7 +469,7 @@
"type": "pip",
"pip": {
"Name": "transformers",
- "Version": "2.11.0"
+ "Version": "4.36.0"
},
"comments": "Installed in the training docker image"
}
@@ -570,7 +570,7 @@
"git": {
"commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
"repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
- }
+ }
}
}
],
diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py
index 81181d3ccfb20..3cecbb0cc977f 100644
--- a/cgmanifests/generate_cgmanifest.py
+++ b/cgmanifests/generate_cgmanifest.py
@@ -115,8 +115,8 @@ def normalize_path_separators(path):
submodule_lines = proc.stdout.splitlines()
for submodule_line in submodule_lines:
(absolute_path, url, commit) = submodule_line.split(" ")
- git_deps[GitDep(commit, url)] = "git submodule at {}".format(
- normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))
+ git_deps[GitDep(commit, url)] = (
+ f"git submodule at {normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))}"
)
with open(os.path.join(SCRIPT_DIR, "..", "cmake", "deps.txt")) as f:
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index 137ea8a50c011..78db7d735dad9 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -6,7 +6,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "a896e3d066448b3530dbcaa48869fafefd738f57",
+ "commitHash": "d52c46520124845b1e0e0525f2759299d840143f",
"repositoryUrl": "https://github.com/emscripten-core/emsdk.git"
},
"comments": "git submodule at cmake/external/emsdk"
@@ -26,7 +26,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "b86cc54efce19530fb953e4b21f57e6b3888534c",
+ "commitHash": "595228d99e3977ac27cb79d5963adda262af99ad",
"repositoryUrl": "https://github.com/onnx/onnx.git"
},
"comments": "git submodule at cmake/external/onnx"
@@ -36,12 +36,22 @@
"component": {
"type": "git",
"git": {
- "commitHash": "dcd5bd5fd593e31465af3d9ef291d26c646b0a4f",
+ "commitHash": "4a2c63365eff8823a5221db86ef490e828306f9d",
"repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
},
"comments": "abseil_cpp"
}
},
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "dbb0094fd0cb936469e35320bf37e866ef7a1da4",
+ "repositoryUrl": "https://github.com/apple/coremltools.git"
+ },
+ "comments": "coremltools"
+ }
+ },
{
"component": {
"type": "git",
@@ -76,7 +86,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "6df40a2471737b27271bdd9b900ab5f3aec746c7",
+ "commitHash": "0100f6a5779831fa7a651e4b67ef389a8752bd9b",
"repositoryUrl": "https://github.com/google/flatbuffers.git"
},
"comments": "flatbuffers"
@@ -106,7 +116,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "361e8d1cfe0c6c36d30b39f1b61302ece5507320",
+ "commitHash": "344117638c8ff7e239044fd0fa7085839fc03021",
"repositoryUrl": "https://github.com/google/benchmark.git"
},
"comments": "google_benchmark"
@@ -196,7 +206,17 @@
"component": {
"type": "git",
"git": {
- "commitHash": "a43ce67187bab219520fd80f21af8bbd4354bc8c",
+ "commitHash": "150e7527d5286ddd3a995c228dedf8d76a7a86bc",
+ "repositoryUrl": "https://github.com/intel/neural-speed.git"
+ },
+ "comments": "neural_speed"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d",
"repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
},
"comments": "onnx_tensorrt"
@@ -256,7 +276,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "5723bb8950318135ed9cf4fc76bed988a087f536",
+ "commitHash": "2b354c6ad0d0479dcff68dab23fb0d1143a482c2",
"repositoryUrl": "https://github.com/google/re2.git"
},
"comments": "re2"
@@ -321,6 +341,16 @@
},
"comments": "composable_kernel"
}
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "de28d93dfa9ebf3e473127c1c657e1920a5345ee",
+ "repositoryUrl": "https://github.com/microsoft/DirectX-Headers.git"
+ },
+ "comments": "directx_headers"
+ }
}
]
}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 7494035e4784e..5200b447d553f 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -59,8 +59,8 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif()
-if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8)
- message(FATAL_ERROR "GCC version must be greater than or equal to 8")
+if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9)
+ message(FATAL_ERROR "GCC version must be greater than or equal to 9")
endif()
# Options
@@ -76,9 +76,10 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
# Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through
# OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical
# use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
-cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF)
+cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
+option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Useful for a very minimal TRT build" OFF)
option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
option(onnxruntime_USE_COREML "Build with CoreML support" OFF)
@@ -87,6 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
+option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" OFF)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
@@ -115,9 +117,7 @@ option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)
option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF)
-#It's preferred to turn it OFF when onnxruntime is dynamically linked to PROTOBUF. But Tensort always required the full version of protobuf.
-cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF "NOT onnxruntime_USE_TENSORRT" ON)
-option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
+option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF)
option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)
@@ -129,6 +129,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF)
option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF)
option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF)
option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF)
+option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF)
option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF)
option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON)
option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON)
@@ -322,17 +323,29 @@ if (onnxruntime_USE_ROCM)
endif()
# replicate strategy used by pytorch to get ROCM_VERSION
- # https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173
- file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW)
- string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW})
- if (ROCM_VERSION_DEV_MATCH)
+ # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
+ # with modification
+ if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
+ file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
+ string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
+ elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
+ file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
+ string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
+ elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
+ file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
+ string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
+ endif()
+
+ if (ROCM_VERSION_MATCH)
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
+ else()
+ message(FATAL_ERROR "Cannot determine ROCm version string")
endif()
- message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version-dev ****\n")
+ message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
@@ -352,13 +365,7 @@ if (onnxruntime_USE_ROCM)
endif()
endif()
-if (APPLE)
- if (NOT CMAKE_OSX_ARCHITECTURES)
- message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}")
- endif()
-elseif (NOT WIN32 AND NOT APPLE)
- message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}")
-endif()
+
# Single output directory for all binaries
set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.")
@@ -376,7 +383,7 @@ function(set_msvc_c_cpp_compiler_warning_level warning_level)
get_property(opts DIRECTORY PROPERTY COMPILE_OPTIONS)
# only match the generator expression added by this function
list(FILTER opts
-       EXCLUDE REGEX "^\\$<\\$<OR:\\$<COMPILE_LANGUAGE:CXX>,\\$<COMPILE_LANGUAGE:C>>:/W[0-4]>$")
+       EXCLUDE REGEX "^\\$<\\$<COMPILE_LANGUAGE:CXX,C>:/W[0-4]>$")
     list(APPEND opts "$<$<COMPILE_LANGUAGE:CXX,C>:${warning_flag}>")
set_property(DIRECTORY PROPERTY COMPILE_OPTIONS "${opts}")
endif()
@@ -491,6 +498,14 @@ endif()
include(adjust_global_compile_flags.cmake)
+if (APPLE)
+ if (NOT CMAKE_OSX_ARCHITECTURES)
+ message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH")
+ endif()
+elseif (NOT WIN32 AND NOT APPLE)
+ message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH")
+endif()
+
# We need to link with libatomic on systems that do not have built-in atomics, or
# don't have built-in support for 8 byte atomics
# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt
@@ -636,8 +651,24 @@ else()
check_cxx_compiler_flag(-Wunused-but-set-variable HAS_UNUSED_BUT_SET_VARIABLE)
check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE)
check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST)
+ check_cxx_compiler_flag(-Wstringop-overflow HAS_STRINGOP_OVERFLOW)
+ if(onnxruntime_ENABLE_TRAINING_APIS)
+ check_cxx_compiler_flag(-Wdangling-reference HAS_DANGLING_REFERENCE)
+ if(HAS_DANGLING_REFERENCE)
+ list(APPEND ORT_WARNING_FLAGS -Wno-dangling-reference)
+ endif()
+ endif()
check_function_exists(reallocarray HAS_REALLOCARRAY)
-
+ if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64")
+ check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16)
+ if(NOT HAS_ARM64_BFLOAT16)
+ message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!")
+ endif()
+ check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16)
+ if(NOT HAS_ARM64_FLOAT16)
+ message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!")
+ endif()
+ endif()
if (HAS_TAUTOLOGICAL_POINTER_COMPARE)
#we may have extra null pointer checkings in debug build, it's not an issue
list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare)
@@ -701,6 +732,9 @@ if (onnxruntime_USE_CUDA)
set(onnxruntime_USE_FLASH_ATTENTION OFF)
set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
endif()
+ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4)
+ message( FATAL_ERROR "Failed build due to CUDA compiler version < 11.4")
+ endif()
else()
set(onnxruntime_USE_FLASH_ATTENTION OFF)
set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
@@ -721,8 +755,8 @@ if (onnxruntime_USE_CUDA)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_MEMORY_EFFICIENT_ATTENTION=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_MEMORY_EFFICIENT_ATTENTION=1)
endif()
-
endif()
+
if (onnxruntime_USE_VITISAI)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VITISAI=1)
@@ -763,6 +797,40 @@ if (onnxruntime_USE_QNN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_QNN=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_QNN=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES qnn)
+ if (NOT QNN_ARCH_ABI)
+ string(TOLOWER ${onnxruntime_target_platform} GEN_PLATFORM)
+ if(MSVC)
+ message(STATUS "Building MSVC for architecture ${CMAKE_SYSTEM_PROCESSOR} with CMAKE_GENERATOR_PLATFORM as ${GEN_PLATFORM}")
+ if (${GEN_PLATFORM} STREQUAL "arm64")
+ set(QNN_ARCH_ABI aarch64-windows-msvc)
+ elseif (${GEN_PLATFORM} STREQUAL "arm64ec")
+ set(QNN_ARCH_ABI arm64x-windows-msvc)
+ else()
+ set(QNN_ARCH_ABI x86_64-windows-msvc)
+ endif()
+ else()
+ if (${CMAKE_SYSTEM_NAME} STREQUAL "Android")
+ set(QNN_ARCH_ABI aarch64-android-clang6.0)
+ elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+ if (${GEN_PLATFORM} STREQUAL "x86_64")
+ set(QNN_ARCH_ABI x86_64-linux-clang)
+ else()
+ set(QNN_ARCH_ABI aarch64-android)
+ endif()
+ endif()
+ endif()
+ endif()
+
+ if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+ file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so" "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll")
+ if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc" OR ${QNN_ARCH_ABI} STREQUAL "arm64x-windows-msvc")
+ file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so"
+ "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so"
+ "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat")
+ list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB})
+ endif()
+ message(STATUS "QNN lib files: " ${QNN_LIB_FILES})
+ endif()
endif()
if (onnxruntime_USE_SNPE)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_SNPE=1)
@@ -788,6 +856,9 @@ if (onnxruntime_USE_DML)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_DML=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_DML=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES dml)
+ if(onnxruntime_ENABLE_NPU_ADAPTER_ENUMERATION)
+ list(APPEND ORT_PROVIDER_FLAGS -DENABLE_NPU_ADAPTER_ENUMERATION=1)
+ endif()
endif()
if (onnxruntime_USE_MIGRAPHX)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_MIGRAPHX=1)
@@ -886,6 +957,11 @@ function(onnxruntime_set_compile_flags target_name)
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(${target_name} PRIVATE ENABLE_ATEN)
endif()
+
+ if(USE_NEURAL_SPEED)
+ target_compile_definitions(${target_name} PRIVATE ORT_NEURAL_SPEED)
+ endif()
+
set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON)
if (onnxruntime_USE_CUDA)
# Suppress a "conversion_function_not_usable" warning in gsl/span
@@ -965,9 +1041,12 @@ function(onnxruntime_set_compile_flags target_name)
foreach(FLAG ${ORT_WARNING_FLAGS})
        target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options ${FLAG}>")
endforeach()
- if ((NVCC_HAS_STRICT_ALIASING AND "${target_name}" MATCHES "cuda") OR (HAS_STRICT_ALIASING AND NOT "${target_name}" MATCHES "cuda"))
+ if (NVCC_HAS_STRICT_ALIASING AND "${target_name}" MATCHES "cuda")
      target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Wno-strict-aliasing>")
endif()
+ if (HAS_STRICT_ALIASING AND NOT "${target_name}" MATCHES "cuda")
+      target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-strict-aliasing>")
+ endif()
endif()
if (onnxruntime_USE_ROCM)
# flags are detected with CXX language mode, some flags are not supported with hipclang
@@ -1088,7 +1167,7 @@ function(onnxruntime_add_include_to_target dst_target)
endfunction()
# ACL
-if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002)
+if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308)
set(onnxruntime_USE_ACL ON)
if (onnxruntime_USE_ACL_1902)
add_definitions(-DACL_1902=1)
@@ -1099,7 +1178,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905
if (onnxruntime_USE_ACL_2002)
add_definitions(-DACL_2002=1)
else()
- add_definitions(-DACL_1905=1)
+ if (onnxruntime_USE_ACL_2308)
+ add_definitions(-DACL_2308=1)
+ else()
+ add_definitions(-DACL_1905=1)
+ endif()
endif()
endif()
endif()
@@ -1166,6 +1249,13 @@ if (onnxruntime_USE_DNNL)
add_compile_definitions(DNNL_OPENMP)
endif()
+if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_TVM)
+ include(neural_speed)
+ if (USE_NEURAL_SPEED)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla)
+ endif()
+endif()
+
# TVM EP
if (onnxruntime_USE_TVM)
if (NOT TARGET tvm)
@@ -1206,18 +1296,10 @@ if (onnxruntime_USE_TVM)
$)
set(onnxruntime_tvm_libs onnxruntime_providers_tvm)
-
- # needs to link with stdc++fs in Linux
- if (UNIX)
- if (NOT APPLE)
- set(FS_STDLIB stdc++fs)
- endif()
- endif()
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm ${FS_STDLIB})
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm)
list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm)
endif()
-
# onnxruntime-extensions
if (onnxruntime_USE_EXTENSIONS)
include(extensions)
@@ -1226,11 +1308,7 @@ endif()
#Dependencies end. In the next we'll enable "treat warning as error"
#Adjust warning flags
-if (onnxruntime_USE_CUDA)
- set_msvc_c_cpp_compiler_warning_level(3)
-else()
- set_msvc_c_cpp_compiler_warning_level(4)
-endif()
+set_msvc_c_cpp_compiler_warning_level(4)
set(onnxruntime_DELAYLOAD_FLAGS "")
@@ -1249,67 +1327,30 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DUSE_OPENVINO=1)
- if (EXISTS "$ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt")
- file(READ $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt VER)
- endif()
-
- if (NOT DEFINED ENV{INTEL_OPENVINO_DIR})
- message(FATAL_ERROR "[Couldn't locate OpenVINO] OpenVINO may not have been initialized")
- endif()
-
- # Check OpenVINO version for support
- if ($ENV{INTEL_OPENVINO_DIR} MATCHES "2022.3")
- set(OPENVINO_VERSION "2022.3")
- add_definitions(-DOPENVINO_2022_3=1)
- elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.0")
- set(OPENVINO_VERSION "2023.0")
- add_definitions(-DOPENVINO_2023_0=1)
- elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.1")
- set(OPENVINO_VERSION "2023.1")
- add_definitions(-DOPENVINO_2023_1=1)
- elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.2")
- set(OPENVINO_VERSION "2023.2")
- add_definitions(-DOPENVINO_2023_1=1)
- elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "openvino")
- set(OPENVINO_VERSION "2023.2")
- add_definitions(-DOPENVINO_2023_2=1)
- else()
- message(FATAL_ERROR "Unsupported OpenVINO version: ${INTEL_OPENVINO_DIR}")
- endif()
-
- if (onnxruntime_USE_OPENVINO_GPU_FP32)
- add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
- endif()
-
- if (onnxruntime_USE_OPENVINO_GPU_FP16)
- add_definitions(-DOPENVINO_CONFIG_GPU_FP16=1)
+ if (onnxruntime_USE_OPENVINO_GPU)
+ add_definitions(-DOPENVINO_CONFIG_GPU=1)
endif()
- if (onnxruntime_USE_OPENVINO_CPU_FP32)
- add_definitions(-DOPENVINO_CONFIG_CPU_FP32=1)
+ if (onnxruntime_USE_OPENVINO_CPU)
+ add_definitions(-DOPENVINO_CONFIG_CPU=1)
endif()
- if (onnxruntime_USE_OPENVINO_CPU_FP16)
- add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1)
+ if (onnxruntime_USE_OPENVINO_NPU)
+ add_definitions(-DOPENVINO_CONFIG_NPU=1)
endif()
- if (onnxruntime_USE_OPENVINO_GPU_FP32_NP)
- add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
+ if (onnxruntime_USE_OPENVINO_GPU_NP)
+ add_definitions(-DOPENVINO_CONFIG_GPU=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()
- if (onnxruntime_USE_OPENVINO_GPU_FP16_NP)
- add_definitions(-DOPENVINO_CONFIG_GPU_FP16=1)
+ if (onnxruntime_USE_OPENVINO_CPU_NP)
+ add_definitions(-DOPENVINO_CONFIG_CPU=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()
- if (onnxruntime_USE_OPENVINO_CPU_FP32_NP)
- add_definitions(-DOPENVINO_CONFIG_CPU_FP32=1)
- add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
- endif()
-
- if (onnxruntime_USE_OPENVINO_CPU_FP16_NP)
- add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1)
+ if (onnxruntime_USE_OPENVINO_NPU_NP)
+ add_definitions(-DOPENVINO_CONFIG_NPU=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()
@@ -1367,6 +1408,10 @@ endif()
if (onnxruntime_USE_CUDA)
set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
set(CMAKE_CUDA_STANDARD 17)
+ if(onnxruntime_CUDA_HOME)
+    file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
+ endif()
+ find_package(CUDAToolkit REQUIRED)
if(onnxruntime_CUDNN_HOME)
file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
endif()
@@ -1408,6 +1453,11 @@ if (onnxruntime_USE_CUDA)
if (NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS --compiler-options -fPIC)
endif()
+ if(MSVC)
+ if(CUDA_NVCC_FLAGS MATCHES "Zi")
+ list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-FS")
+ endif()
+ endif()
# Options passed to cudafe
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=unsigned_compare_with_zero\"")
@@ -1418,16 +1468,6 @@ if (onnxruntime_USE_CUDA)
endif()
endif()
-if (onnxruntime_USE_TENSORRT)
- # needs to link with stdc++fs in Linux
- if (UNIX)
- if (NOT APPLE)
- set(FS_STDLIB stdc++fs)
- endif()
- endif()
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${FS_STDLIB})
-endif()
-
if (onnxruntime_USE_MIGRAPHX)
if (WIN32)
message(FATAL_ERROR "MIGraphX does not support build in Windows!")
@@ -1567,7 +1607,7 @@ if (UNIX AND onnxruntime_USE_NCCL)
else()
set(onnxruntime_USE_NCCL OFF)
set(onnxruntime_USE_MPI OFF)
-message( WARNING "MPI and NCCL disabled on Win build." )
+  message( WARNING "MPI and NCCL are disabled because the build is on Windows or USE_NCCL is set to OFF." )
endif()
if (onnxruntime_USE_MPI)
@@ -1632,8 +1672,8 @@ if (onnxruntime_USE_WINML)
endif() # if (onnxruntime_USE_WINML)
if (onnxruntime_BUILD_SHARED_LIB OR onnxruntime_BUILD_APPLE_FRAMEWORK)
- if (onnxruntime_BUILD_APPLE_FRAMEWORK AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS")
- message(FATAL_ERROR "onnxruntime_BUILD_APPLE_FRAMEWORK can only be enabled for macOS or iOS.")
+ if (onnxruntime_BUILD_APPLE_FRAMEWORK AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS")
+    message(FATAL_ERROR "onnxruntime_BUILD_APPLE_FRAMEWORK can only be enabled for macOS, iOS, or visionOS.")
endif()
list(APPEND ONNXRUNTIME_CMAKE_FILES onnxruntime)
endif()
@@ -1696,14 +1736,12 @@ if(onnxruntime_BUILD_KERNEL_EXPLORER)
endif()
# When GDK_PLATFORM is set then WINAPI_FAMILY is defined in gdk_toolchain.cmake (along with other relevant flags/definitions).
-if (WIN32 AND NOT GDK_PLATFORM)
+if (WIN32 AND NOT GDK_PLATFORM AND NOT CMAKE_CROSSCOMPILING)
if (NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib)
# On onecore, link to the onecore build of the MSVC runtime
get_filename_component(msvc_path "${CMAKE_C_COMPILER}/../../../.." ABSOLUTE)
link_directories(BEFORE "${msvc_path}/lib/onecore/${onnxruntime_target_platform}")
- # The .lib files in the MSVC runtime have a DEFAULITLIB entry for onecore.lib, which in turn links to reverse forwarders.
- # We ignore that entry and use onecore_apiset.lib instead, since system components must not rely on reverse forwarders.
- add_link_options("/NODEFAULTLIB:onecore.lib")
+      # The .lib files in the MSVC runtime have a DEFAULTLIB entry for onecore.lib, but it should not cause any conflict with onecoreuap.lib
endif()
endif()
@@ -1783,5 +1821,5 @@ endif()
if(DEFINED BUILD_AS_ARM64X)
set(ARM64X_TARGETS onnxruntime)
- include("${CMAKE_SOURCE_DIR}/arm64x.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/arm64x.cmake")
endif()
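For reference, a minimal standalone sketch (not part of the patch) of the ROCM_VERSION_DEV_INT arithmetic introduced in the ROCm hunk above; the version 6.0.2 is illustrative.

    # Illustrative only: mirrors the math(EXPR ...) computation used above.
    set(ROCM_VERSION_DEV_MAJOR 6)
    set(ROCM_VERSION_DEV_MINOR 0)
    set(ROCM_VERSION_DEV_PATCH 2)
    math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
    message(STATUS "ROCM_VERSION_DEV_INT=${ROCM_VERSION_DEV_INT}")  # prints 60002 for ROCm 6.0.2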
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index e825bfeaea952..690b6d4e66154 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -8,6 +8,15 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android")
string(APPEND CMAKE_ASM_FLAGS_RELEASE " -O3")
endif()
+# Suggested by https://gitlab.kitware.com/cmake/cmake/-/issues/20132
+# MacCatalyst is not well supported in CMake
+# The error that can emerge without this flag can look like:
+# "clang : error : overriding '-mmacosx-version-min=11.0' option with '-target x86_64-apple-ios14.0-macabi' [-Werror,-Woverriding-t-option]"
+if (PLATFORM_NAME STREQUAL "macabi")
+ add_compile_options(-Wno-overriding-t-option)
+ add_link_options(-Wno-overriding-t-option)
+endif()
+
# Enable space optimization for gcc/clang
# Cannot use "-ffunction-sections -fdata-sections" if we enable bitcode (iOS)
if (NOT MSVC AND NOT onnxruntime_ENABLE_BITCODE)
@@ -16,9 +25,7 @@ if (NOT MSVC AND NOT onnxruntime_ENABLE_BITCODE)
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
- string(APPEND CMAKE_C_FLAGS " -s STRICT=1 -s DEFAULT_TO_CXX=1")
- string(APPEND CMAKE_CXX_FLAGS " -s STRICT=1 -s DEFAULT_TO_CXX=1")
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s ALLOW_UNIMPLEMENTED_SYSCALLS=1")
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s ALLOW_UNIMPLEMENTED_SYSCALLS=1 -s DEFAULT_TO_CXX=1")
# Enable LTO for release single-thread build
if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -74,11 +81,6 @@ if (onnxruntime_MINIMAL_BUILD)
endif()
if (MSVC)
- # turn on LTO (which adds some compiler flags and turns on LTCG) unless it's a Debug build to minimize binary size
- if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
- set(onnxruntime_ENABLE_LTO ON)
- endif()
-
# undocumented internal flag to allow analysis of a minimal build binary size
if (ADD_DEBUG_INFO_TO_MINIMAL_BUILD)
string(APPEND CMAKE_CXX_FLAGS " /Zi")
@@ -99,7 +101,7 @@ if (onnxruntime_MINIMAL_BUILD)
endif()
endif()
-# enable stream for all the non-minimal build
+# Enable stream for all the non-minimal build
if (NOT onnxruntime_MINIMAL_BUILD)
add_compile_definitions(ORT_ENABLE_STREAM)
endif()
@@ -130,6 +132,11 @@ if (onnxruntime_DISABLE_RTTI)
add_compile_options("$<$:/GR->" "$<$:/we4541>")
else()
add_compile_options("$<$:-fno-rtti>")
+ if (onnxruntime_USE_WEBNN)
+ # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled
+ # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001
+      add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0>")
+ endif()
endif()
else()
#MSVC RTTI flag /GR is not added to CMAKE_CXX_FLAGS by default. But, anyway VC++2019 treats "/GR" default on.
@@ -197,9 +204,9 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
endif()
endif()
-# Mark symbols to be invisible, for macOS/iOS target only
+# Mark symbols to be invisible, for macOS/iOS/visionOS target only
# Due to many dependencies have different symbol visibility settings, set global compile flags here.
-if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS")
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS")
foreach(flags CMAKE_CXX_FLAGS CMAKE_OBJC_FLAGS CMAKE_OBJCXX_FLAGS)
string(APPEND ${flags} " -fvisibility=hidden -fvisibility-inlines-hidden")
endforeach()
@@ -207,7 +214,7 @@ endif()
macro(check_nvcc_compiler_flag _FLAG _RESULT)
- execute_process(COMMAND ${onnxruntime_CUDA_HOME}/bin/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR)
+ execute_process(COMMAND ${CUDAToolkit_BIN_DIR}/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR)
message("NVCC_ERROR = ${NVCC_ERROR}")
message("NVCC_OUT = ${NVCC_OUT}")
if ("${NVCC_OUT}" MATCHES "0")
@@ -267,39 +274,38 @@ if (MSVC)
string(APPEND CMAKE_C_FLAGS " /arch:AVX512")
endif()
- if (NOT GDK_PLATFORM)
- add_compile_definitions(WINAPI_FAMILY=100) # Desktop app
- message("Building ONNX Runtime for Windows 10 and newer")
- add_compile_definitions(WINVER=0x0A00 _WIN32_WINNT=0x0A00 NTDDI_VERSION=0x0A000000)
- endif()
if (onnxruntime_ENABLE_LTO AND NOT onnxruntime_USE_CUDA)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Gw /GL")
endif()
-
- # The WinML build tool chain builds ARM/ARM64, and the internal tool chain does not have folders for spectre mitigation libs.
- # WinML performs spectre mitigation differently.
- if (NOT DEFINED onnxruntime_DISABLE_QSPECTRE_CHECK)
- check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE)
- if (HAS_QSPECTRE)
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qspectre")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qspectre")
- endif()
- endif()
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DYNAMICBASE")
- check_cxx_compiler_flag(-guard:cf HAS_GUARD_CF)
- if (HAS_GUARD_CF)
- set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /guard:cf")
- set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /guard:cf")
- set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /guard:cf")
- set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /guard:cf")
- set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} /guard:cf")
- set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /guard:cf")
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /guard:cf")
- endif()
else()
if (NOT APPLE)
+      #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker
+      #image on an aarch64 machine with an aarch64 Ubuntu host OS, cmake inside the docker instance may still report
+      # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Since compiling this code may need more than 2GB of memory and we
+      # only support cross-compiling (not native builds) for ARM32, we will ignore this issue for now.
+ if(NOT CMAKE_SYSTEM_PROCESSOR)
+ message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. Please set it in your toolchain cmake file.")
+ # Try to detect it
+ if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+ execute_process(
+ COMMAND "${CMAKE_C_COMPILER}" -dumpmachine
+ OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_VARIABLE _err
+ RESULT_VARIABLE _res
+ )
+ if(NOT _res EQUAL 0)
+ message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}")
+ endif()
+ string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}")
+ list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN)
+ if(GCC_TRIPLET_LEN EQUAL 4)
+ list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR)
+ message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}")
+ endif()
+ endif()
+ endif()
set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR})
endif()
if (onnxruntime_BUILD_FOR_NATIVE_MACHINE)
@@ -353,16 +359,9 @@ else()
endif()
-if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
- #For Mac compliance
- message("Adding flags for Mac builds")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
-elseif (WIN32)
- # parallel build
- # These compiler opitions cannot be forwarded to NVCC, so cannot use add_compiler_options
- string(APPEND CMAKE_CXX_FLAGS " /MP")
+if (WIN32)
# required to be set explicitly to enable Eigen-Unsupported SpecialFunctions
string(APPEND CMAKE_CXX_FLAGS " -DEIGEN_HAS_C99_MATH")
-else()
+elseif(LINUX)
add_compile_definitions("_GNU_SOURCE")
endif()
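As a rough sketch (not part of the patch) of the -dumpmachine fallback above: the compiler prints a target triplet, and the first dash-separated component becomes CMAKE_SYSTEM_PROCESSOR. The triplet value shown is illustrative and depends on the actual toolchain.

    # e.g. GCC prints a 4-part triplet such as "aarch64-unknown-linux-gnu";
    # splitting on "-" and taking the first element yields "aarch64".
    execute_process(COMMAND "${CMAKE_C_COMPILER}" -dumpmachine
                    OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE)
    string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}")
    list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR)  # -> "aarch64"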
diff --git a/cmake/arm64x.cmake b/cmake/arm64x.cmake
index be476e09625bd..525206367ce95 100644
--- a/cmake/arm64x.cmake
+++ b/cmake/arm64x.cmake
@@ -1,33 +1,36 @@
-set(arm64ReproDir "${CMAKE_SOURCE_DIR}/repros")
+set(arm64ReproDir "${CMAKE_CURRENT_SOURCE_DIR}/repros")
+
+function(set_arm64_dependencies n)
+ set(ARM64_LIBS)
+ set(ARM64_OBJS)
+ set(ARM64_DEF)
+ set(REPRO_PATH "${arm64ReproDir}/${n}")
+ if(NOT EXISTS "${REPRO_PATH}")
+ set(REPRO_PATH "${arm64ReproDir}/${n}_temp")
+ endif()
+ file(GLOB ARM64_OBJS "${REPRO_PATH}/*.obj")
+ file(GLOB ARM64_DEF "${REPRO_PATH}/*.def")
+ file(GLOB ARM64_LIBS "${REPRO_PATH}/*.LIB")
+
+ if(NOT "${ARM64_DEF}" STREQUAL "")
+ set(ARM64_DEF "/defArm64Native:${ARM64_DEF}")
+ endif()
+ target_sources(${n} PRIVATE ${ARM64_OBJS})
+ target_link_options(${n} PRIVATE /machine:arm64x "${ARM64_DEF}" "${ARM64_LIBS}")
+endfunction()
if("${BUILD_AS_ARM64X}" STREQUAL "ARM64")
foreach (n ${ARM64X_TARGETS})
add_custom_target(mkdirs_${n} ALL COMMAND cmd /c (if exist \"${arm64ReproDir}/${n}_temp/\" rmdir /s /q \"${arm64ReproDir}/${n}_temp\") && mkdir \"${arm64ReproDir}/${n}_temp\" )
add_dependencies(${n} mkdirs_${n})
target_link_options(${n} PRIVATE "/LINKREPRO:${arm64ReproDir}/${n}_temp")
- add_custom_target(${n}_checkRepro ALL COMMAND cmd /c if exist \"${n}_temp/*.obj\" if exist \"${n}\" rmdir /s /q \"${n}\" 2>nul && if not exist \"${n}\" ren \"${n}_temp\" \"${n}\" DEPENDS ${n}
- WORKING_DIRECTORY ${arm64ReproDir})
+ add_custom_target(${n}_checkRepro ALL COMMAND cmd /c if exist \"${n}_temp/*.obj\" if exist \"${n}\" rmdir /s /q \"${n}\" 2>nul && if not exist \"${n}\" ren \"${n}_temp\" \"${n}\" WORKING_DIRECTORY ${arm64ReproDir})
+ add_dependencies(${n}_checkRepro ${n})
endforeach()
elseif("${BUILD_AS_ARM64X}" STREQUAL "ARM64EC")
foreach (n ${ARM64X_TARGETS})
- set(ARM64_LIBS)
- set(ARM64_OBJS)
- set(ARM64_DEF)
-
- file(GLOB ARM64_OBJS "${arm64ReproDir}/${n}/*.obj")
- file(GLOB ARM64_DEF "${arm64ReproDir}/${n}/*.def")
- file(GLOB ARM64_LIBS "${arm64ReproDir}/${n}/*.LIB")
-
- if(NOT "${ARM64_DEF}" STREQUAL "")
- set(ARM64_DEF "/defArm64Native:${ARM64_DEF}")
- endif()
- target_sources(${n} PRIVATE ${ARM64_OBJS})
- target_link_options(${n} PRIVATE /machine:arm64x "${ARM64_DEF}")
-
- if(NOT "${ARM64_LIBS}" STREQUAL "")
- target_link_libraries(${n} PUBLIC ${ARM64_LIBS})
- endif()
+ set_arm64_dependencies(${n})
endforeach()
endif()
diff --git a/cmake/deps.txt b/cmake/deps.txt
index ff07803013071..88c1881ad82fb 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -12,7 +12,8 @@
# NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI.
# See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29
#
-abseil_cpp;https://github.com/abseil/abseil-cpp/archive/dcd5bd5fd593e31465af3d9ef291d26c646b0a4f.zip;6cc204586014e189f5c0fe3274f83162fa7c700c
+abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240116.0.zip;bc2cec6baaad67fcb6c0c38972b687d4797927e9
+coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a
cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0
date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159
dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445
@@ -22,10 +23,10 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132
# Until the 3.4.1 release this is the best option we have.
# Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744
eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a
-flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf
+flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c
fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
-google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.7.0.zip;e97c368b176e8614e3f1bf13dd9abcf6a7ad9908
+google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip;bf9870756ee3f8d2d3b346b24ee3600a41c74d3d
google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752
googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73
@@ -34,9 +35,10 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
-onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11
-#use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459)
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035
+neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
+onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c
+#use the latest commit of 10.0-GA
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
@@ -48,10 +50,11 @@ psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e9
pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip;07a0aa91dd9bf86f31b95497e00f31d8a261a4bd
pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13
pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8bf301845f2af720e0aa4.zip;85da3caa60eb2b148613b443fbc2bfdc30689965
-re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374
+re2;https://github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz;206cfee5ee0b4c6844680ba66275e9e8faa77405
safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac
tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381
cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.1.0.zip;757f90a795034a89d4f48a79d1f009f7a04c8dee
utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156
extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299
+directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
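Each non-comment row in deps.txt follows the name;URL;SHA1 format. As a hedged sketch of how a row is consumed (the actual wiring lives in cmake/external/onnxruntime_external_deps.cmake, shown further below), the re2 row above roughly expands to:

    # Sketch only: the name column becomes DEP_URL_<name> / DEP_SHA1_<name> variables
    # that feed a FetchContent declaration for that dependency.
    set(DEP_URL_re2  "https://github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz")
    set(DEP_SHA1_re2 "206cfee5ee0b4c6844680ba66275e9e8faa77405")
    include(FetchContent)
    FetchContent_Declare(re2 URL ${DEP_URL_re2} URL_HASH SHA1=${DEP_SHA1_re2})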
diff --git a/cmake/deps_update_and_upload.py b/cmake/deps_update_and_upload.py
index d357284d91225..63df3f6f03869 100644
--- a/cmake/deps_update_and_upload.py
+++ b/cmake/deps_update_and_upload.py
@@ -1,56 +1,109 @@
-# in case deps.txt is updated, run this file to update and upload the dependencies so that CI can use them.
-# Before running the script, increase the version number found at:
+# If deps.txt is updated, run this file to update and upload the dependencies so that CI can use them.
+#
+# Before running the script, find the latest version number at:
# https://aiinfra.visualstudio.com/Lotus/_artifacts/feed/Lotus/UPack/onnxruntime_build_dependencies/versions
+# Increment it to obtain a new version number to use.
+#
# Run without --do-upload once to verify downloading. Use --do-upload when you are ready to publish.
-# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --do-upload
-# update version number in tools\ci_build\github\azure-pipelines\templates\download-deps.yml
+# E.g.:
+# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82
+# # check contents of C:/temp/onnxruntime_deps
+# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --no-download --do-upload
+#
+# Next, update the version number in tools/ci_build/github/azure-pipelines/templates/download-deps.yml.
+
+import argparse
+import contextlib
+import pathlib
import re
import subprocess
-import os
-import argparse
import tempfile
+script_dir = pathlib.Path(__file__).parent
+
parser = argparse.ArgumentParser(description="Update dependencies and publish to Azure Artifacts")
parser.add_argument(
- "--root-path", type=str, default=tempfile.gettempdir(), help="Target root path for downloaded files"
+ "--root-path",
+ type=pathlib.Path,
+ help="Target root path for downloaded files. If not provided, a temporary directory is used.",
+)
+parser.add_argument(
+ "--version",
+ type=str,
+ help="Package version to publish",
+)
+parser.add_argument(
+ "--do-upload",
+ action="store_true",
+ dest="upload",
+ help="Upload the package to Azure Artifacts",
+)
+parser.add_argument(
+ "--no-download",
+ action="store_false",
+ dest="download",
+ help="Skip downloading the dependency files. "
+ "Use with '--do-upload' and '--root-path' to upload the package from existing dependency files.",
)
-parser.add_argument("--version", type=str, default="1.0.82", help="Package version to publish")
-parser.add_argument("--do-upload", action="store_true", help="Upload the package to Azure Artifacts")
args = parser.parse_args()
-with open("cmake/deps.txt") as file:
+if args.upload:
+ assert args.version is not None, "'--version' must be specified if uploading."
+
+if args.upload != args.download:
+ assert args.root_path is not None, "'--root-path' must be specified if only downloading or uploading."
+
+deps_path = script_dir / "deps.txt"
+with open(deps_path) as file:
text = file.read()
lines = [line for line in text.split("\n") if not line.startswith("#") and ";" in line]
-root_path = args.root_path
-
-for line in lines:
- url = re.sub("^[^;]+?;https://([^;]+?);.*", r"https://\1", line)
- filename = re.sub("^[^;]+?;https://([^;]+?);.*", r"\1", line)
- full_path = os.path.join(root_path, filename)
- subprocess.run(["curl", "-sSL", "--create-dirs", "-o", full_path, url]) # noqa: PLW1510
-
-package_name = "onnxruntime_build_dependencies"
-version = args.version
-
-# Check if the user is logged in to Azure
-result = subprocess.run("az account show", shell=True, capture_output=True, text=True) # noqa: PLW1510
-if "No subscriptions found" in result.stderr:
- # Prompt the user to log in to Azure
- print("You are not logged in to Azure. Please log in to continue.")
- subprocess.run("az login", shell=True) # noqa: PLW1510
-
-# Publish the package to Azure Artifacts if --no-upload is not specified
-
-cmd = f'az artifacts universal publish --organization https://dev.azure.com/onnxruntime --feed onnxruntime --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}'
-if args.do_upload:
- subprocess.run(cmd, shell=True) # noqa: PLW1510
-else:
- print("would have run: " + cmd)
-
-cmd = f'az artifacts universal publish --organization https://dev.azure.com/aiinfra --feed Lotus --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}'
-if args.do_upload:
- subprocess.run(cmd, shell=True) # noqa: PLW1510
-else:
- print("would have run: " + cmd)
+with contextlib.ExitStack() as context_stack:
+ if args.root_path is not None:
+ root_path = args.root_path.resolve()
+ root_path.mkdir(parents=True, exist_ok=True)
+ else:
+ temp_dir_name = context_stack.enter_context(tempfile.TemporaryDirectory())
+ root_path = pathlib.Path(temp_dir_name)
+
+ if args.download:
+ print(f"Downloading dependencies to directory: {root_path}")
+
+ dep_pattern = re.compile(r"^[^;]+;https://([^;]+);.*$")
+
+ for line in lines:
+ match = dep_pattern.fullmatch(line)
+ if match is None:
+ continue
+
+ dep_path = match[1]
+ url = f"https://{dep_path}"
+ full_path = root_path / dep_path
+
+ subprocess.run(["curl", "-sSL", "--create-dirs", "-o", str(full_path), url], check=True)
+
+ package_name = "onnxruntime_build_dependencies"
+ version = args.version if args.version is not None else "VERSION_PLACEHOLDER"
+
+ if args.upload:
+ # Check if the user is logged in to Azure
+ result = subprocess.run("az account show", shell=True, capture_output=True, text=True, check=False)
+ if "No subscriptions found" in result.stderr:
+ # Prompt the user to log in to Azure
+ print("You are not logged in to Azure. Please log in to continue.")
+ subprocess.run("az login", shell=True, check=True)
+
+ # Publish the package to Azure Artifacts if --do-upload is specified
+
+ cmd = f'az artifacts universal publish --organization https://dev.azure.com/onnxruntime --feed onnxruntime --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}'
+ if args.upload:
+ subprocess.run(cmd, shell=True, check=True)
+ else:
+ print("would have run: " + cmd)
+
+ cmd = f'az artifacts universal publish --organization https://dev.azure.com/aiinfra --feed Lotus --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}'
+ if args.upload:
+ subprocess.run(cmd, shell=True, check=True)
+ else:
+ print("would have run: " + cmd)
diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake
index 3bcd4109e2888..c01195c99e28d 100644
--- a/cmake/external/abseil-cpp.cmake
+++ b/cmake/external/abseil-cpp.cmake
@@ -19,7 +19,7 @@ if(WIN32 AND NOT Patch_FOUND)
set(ABSL_ENABLE_INSTALL ON)
endif()
# NB! Advancing Abseil version changes its internal namespace,
-# currently absl::lts_20230125 which affects abseil-cpp.natvis debugger
+# currently absl::lts_20240116 which affects abseil-cpp.natvis debugger
# visualization file, that must be adjusted accordingly, unless we eliminate
# that namespace at build time.
FetchContent_Declare(
@@ -45,10 +45,8 @@ endif()
# TODO: since multiple ORT's dependencies depend on Abseil, the list below would vary from version to version.
# We'd better to not manually manage the list.
-set(ABSEIL_LIBS absl::base
+set(ABSEIL_LIBS
absl::city
-absl::core_headers
-absl::fixed_array
absl::flags
absl::flat_hash_map
absl::flat_hash_set
@@ -60,9 +58,34 @@ absl::node_hash_set
absl::optional
absl::raw_hash_set
absl::raw_logging_internal
-absl::span
absl::str_format
-absl::strings
+absl::str_format_internal
+absl::bits
+absl::fixed_array
+absl::numeric_representation
+absl::utility
+absl::type_traits
+absl::string_view
+absl::core_headers
+absl::nullability
+absl::span
+absl::config
absl::synchronization
+absl::base
+absl::civil_time
+absl::debugging_internal
+absl::demangle_internal
+absl::graphcycles_internal
+absl::int128
+absl::kernel_timeout_internal
+absl::log_severity
+absl::malloc_internal
+absl::spinlock_wait
+absl::stacktrace
+absl::string_view
+absl::strings
+absl::strings_internal
+absl::symbolize
absl::throw_delegate
-absl::time)
+absl::time
+absl::time_zone)
\ No newline at end of file
diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis
index 1e5a36fb9efb9..a4fb63b6a8377 100644
--- a/cmake/external/abseil-cpp.natvis
+++ b/cmake/external/abseil-cpp.natvis
[Visualizer hunks at lines 1, 24, 51, 60, and 68 of abseil-cpp.natvis; the XML markup was lost in extraction.
 Each hunk updates the Abseil inline namespace in the <Type Name="..."> attributes from absl::lts_20230125 to
 absl::lts_20240116; only display strings such as "*($T1 *){value}" and "{value.first}, {value.second}" survived.]
diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
index 983eecdd88235..f04f4bec76cd5 100644
--- a/cmake/external/cutlass.cmake
+++ b/cmake/external/cutlass.cmake
@@ -1,13 +1,11 @@
-if (onnxruntime_USE_FLASH_ATTENTION OR onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION)
- include(FetchContent)
- FetchContent_Declare(
- cutlass
- URL ${DEP_URL_cutlass}
- URL_HASH SHA1=${DEP_SHA1_cutlass}
- )
+include(FetchContent)
+FetchContent_Declare(
+ cutlass
+ URL ${DEP_URL_cutlass}
+ URL_HASH SHA1=${DEP_SHA1_cutlass}
+)
- FetchContent_GetProperties(cutlass)
- if(NOT cutlass_POPULATED)
- FetchContent_Populate(cutlass)
- endif()
+FetchContent_GetProperties(cutlass)
+if(NOT cutlass_POPULATED)
+ FetchContent_Populate(cutlass)
endif()
diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake
index 5d25b9529e030..f74b694471203 100644
--- a/cmake/external/dml.cmake
+++ b/cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
- set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.12.1)
+ set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.14.1)
# Restore nuget packages, which will pull down the DirectML redist package.
add_custom_command(
@@ -72,12 +72,11 @@ else()
if (dml_EXTERNAL_PROJECT)
    set(dml_preset_config $<IF:$<CONFIG:Debug>,debug,release>)
set(dml_preset_name ${onnxruntime_target_platform}-win-redist-${dml_preset_config})
- target_compile_definitions(DirectML INTERFACE DML_TARGET_VERSION_USE_LATEST=1)
include(ExternalProject)
ExternalProject_Add(
directml_repo
GIT_REPOSITORY https://dev.azure.com/microsoft/WindowsAI/_git/DirectML
- GIT_TAG d460f0f46967bea878786f1bed69487692c779bf
+ GIT_TAG a5312f72c51864b4d705ac62d25d08bcd88c4fb1
GIT_SHALLOW OFF # not allowed when GIT_TAG is a commit SHA, which is preferred (it's stable, unlike branches)
GIT_PROGRESS ON
BUILD_IN_SOURCE ON
@@ -94,8 +93,20 @@ else()
target_link_libraries(DirectML INTERFACE ${directml_install_path}/lib/DirectML.lib)
add_dependencies(DirectML directml_repo-install)
include_directories(BEFORE ${directml_install_path}/include)
+ target_compile_definitions(DirectML INTERFACE DML_TARGET_VERSION_USE_LATEST=1)
else()
include_directories(BEFORE ${dml_INCLUDE_DIR})
set(DML_PACKAGE_DIR ${dml_INCLUDE_DIR}/..)
endif()
endif()
+
+FetchContent_Declare(
+ directx_headers
+ URL ${DEP_URL_directx_headers}
+ URL_HASH SHA1=${DEP_SHA1_directx_headers}
+)
+
+FetchContent_Populate(directx_headers)
+set(directx_headers_INCLUDE_DIRS "${directx_headers_SOURCE_DIR}/include")
+
+include_directories(BEFORE ${directx_headers_INCLUDE_DIRS})
diff --git a/cmake/external/dnnl.cmake b/cmake/external/dnnl.cmake
index d7b70640781d0..9eb5fed7a1af6 100644
--- a/cmake/external/dnnl.cmake
+++ b/cmake/external/dnnl.cmake
@@ -2,7 +2,7 @@ include (ExternalProject)
set(DNNL_URL https://github.com/oneapi-src/onednn.git)
# If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
-set(DNNL_TAG v3.0)
+set(DNNL_TAG v3.0.1)
if(WIN32)
set(DNNL_SHARED_LIB dnnl.dll)
diff --git a/cmake/external/emsdk b/cmake/external/emsdk
index a896e3d066448..d52c465201248 160000
--- a/cmake/external/emsdk
+++ b/cmake/external/emsdk
@@ -1 +1 @@
-Subproject commit a896e3d066448b3530dbcaa48869fafefd738f57
+Subproject commit d52c46520124845b1e0e0525f2759299d840143f
diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake
index 768e807b40600..eefb3ba2e800a 100644
--- a/cmake/external/helper_functions.cmake
+++ b/cmake/external/helper_functions.cmake
@@ -159,7 +159,12 @@ macro(onnxruntime_fetchcontent_makeavailable)
endif()
if(EXISTS ${__cmake_srcdir}/CMakeLists.txt)
+ set(CMAKE_SKIP_INSTALL_RULES TRUE)
+ if (__cmake_arg_SYSTEM)
+ add_subdirectory(${__cmake_srcdir} ${${__cmake_contentNameLower}_BINARY_DIR} SYSTEM)
+ else()
add_subdirectory(${__cmake_srcdir} ${${__cmake_contentNameLower}_BINARY_DIR} EXCLUDE_FROM_ALL)
+ endif()
get_property(subdir_import_targets DIRECTORY "${__cmake_srcdir}" PROPERTY BUILDSYSTEM_TARGETS)
foreach(subdir_target ${subdir_import_targets})
if(TARGET ${subdir_target})
@@ -176,6 +181,7 @@ macro(onnxruntime_fetchcontent_makeavailable)
set_target_properties(${subdir_target} PROPERTIES COMPILE_WARNING_AS_ERROR OFF)
endif()
endforeach()
+ set(CMAKE_SKIP_INSTALL_RULES FALSE)
endif()
unset(__cmake_srcdir)
diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake
new file mode 100644
index 0000000000000..3fe9c660f89d6
--- /dev/null
+++ b/cmake/external/neural_speed.cmake
@@ -0,0 +1,16 @@
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64")
+ set(USE_NEURAL_SPEED TRUE)
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64")
+ set(USE_NEURAL_SPEED TRUE)
+endif()
+
+if(USE_NEURAL_SPEED)
+ FetchContent_Declare(
+ neural_speed
+ URL ${DEP_URL_neural_speed}
+ URL_HASH SHA1=${DEP_SHA1_neural_speed}
+ PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch
+ )
+ set(BTLA_USE_OPENMP OFF)
+ onnxruntime_fetchcontent_makeavailable(neural_speed)
+endif()
diff --git a/cmake/external/onnx b/cmake/external/onnx
index b86cc54efce19..595228d99e397 160000
--- a/cmake/external/onnx
+++ b/cmake/external/onnx
@@ -1 +1 @@
-Subproject commit b86cc54efce19530fb953e4b21f57e6b3888534c
+Subproject commit 595228d99e3977ac27cb79d5963adda262af99ad
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 78f63227c8392..775576a771529 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -14,6 +14,16 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST)
set(DEP_URL_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP_URL})
# The third column is SHA1 hash value
set(DEP_SHA1_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP})
+
+ if(ONNXRUNTIME_DEP_URL MATCHES "^https://")
+ # Search a local mirror folder
+ string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}")
+
+ if(EXISTS "${LOCAL_URL}")
+ cmake_path(ABSOLUTE_PATH LOCAL_URL)
+ set(DEP_URL_${ONNXRUNTIME_DEP_NAME} "${LOCAL_URL}")
+ endif()
+ endif()
endif()
endforeach()
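A worked example (assumption-level, not part of the patch) of the mirror lookup above: the leading "https://" of a dependency URL is rewritten to a path under ${REPO_ROOT}/mirror, and the local archive is used only if it exists.

    # "https://github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz"
    #   -> "${REPO_ROOT}/mirror/github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz"
    string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL
           "https://github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz")
    if(EXISTS "${LOCAL_URL}")
      set(DEP_URL_re2 "${LOCAL_URL}")  # the download is skipped in favor of the local mirror copy
    endif()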
@@ -37,8 +47,13 @@ if (onnxruntime_BUILD_UNIT_TESTS)
set(gtest_disable_pthreads ON)
endif()
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
- if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
- # Needs to update onnxruntime/test/xctest/xcgtest.mm
+ if (IOS OR ANDROID)
+    # On mobile platforms the absl flags class drops the flag names (presumably to reduce binary size), which breaks
+    # passing any args to gtest executables, such as using --gtest_filter to debug a specific test.
+ # Processing of compile definitions:
+ # https://github.com/abseil/abseil-cpp/blob/8dc90ff07402cd027daec520bb77f46e51855889/absl/flags/config.h#L21
+ # If set, this code throws away the flag and does nothing on registration, which results in no flags being known:
+ # https://github.com/abseil/abseil-cpp/blob/8dc90ff07402cd027daec520bb77f46e51855889/absl/flags/flag.h#L205-L217
set(GTEST_HAS_ABSL OFF CACHE BOOL "" FORCE)
else()
set(GTEST_HAS_ABSL ON CACHE BOOL "" FORCE)
@@ -82,7 +97,6 @@ FetchContent_Declare(
)
-
# Flatbuffers
# We do not need to build flatc for iOS or Android Cross Compile
if (CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
@@ -104,45 +118,31 @@ FetchContent_Declare(
URL ${DEP_URL_flatbuffers}
URL_HASH SHA1=${DEP_SHA1_flatbuffers}
PATCH_COMMAND ${ONNXRUNTIME_FLATBUFFERS_PATCH_COMMAND}
- FIND_PACKAGE_ARGS 1.12.0...<2.0.0 NAMES Flatbuffers
+ FIND_PACKAGE_ARGS 23.5.9 NAMES Flatbuffers
+)
+
+
+#Protobuf depends on utf8_range
+FetchContent_Declare(
+ utf8_range
+ URL ${DEP_URL_utf8_range}
+ URL_HASH SHA1=${DEP_SHA1_utf8_range}
+ FIND_PACKAGE_ARGS NAMES utf8_range
)
+set(utf8_range_ENABLE_TESTS OFF CACHE BOOL "Build test suite" FORCE)
+set(utf8_range_ENABLE_INSTALL OFF CACHE BOOL "Configure installation" FORCE)
+
+
# Download a protoc binary from Internet if needed
-if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE)
+if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE)
# This part of code is only for users' convenience. The code couldn't handle all cases. Users always can manually
# download protoc from Protobuf's Github release page and pass the local path to the ONNX_CUSTOM_PROTOC_EXECUTABLE
# variable.
- message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}")
- if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
- if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64")
- FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64})
- FetchContent_Populate(protoc_binary)
- elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86")
- FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32})
- FetchContent_Populate(protoc_binary)
- endif()
- if(protoc_binary_SOURCE_DIR)
- message("Use prebuilt protoc")
- set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe)
- set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE})
- endif()
- elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux")
- if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
- FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64})
- FetchContent_Populate(protoc_binary)
- elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
- FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86})
- FetchContent_Populate(protoc_binary)
- elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
- FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64})
- FetchContent_Populate(protoc_binary)
- endif()
- if(protoc_binary_SOURCE_DIR)
- message("Use prebuilt protoc")
- set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc)
- set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE})
- endif()
- elseif ((CMAKE_SYSTEM_NAME STREQUAL "Emscripten" OR CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "iOS") AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin")
+ if (CMAKE_HOST_APPLE)
+ # Using CMAKE_CROSSCOMPILING is not recommended for Apple target devices.
+ # https://cmake.org/cmake/help/v3.26/variable/CMAKE_CROSSCOMPILING.html
+ # To keep it simple, just download and use the universal protoc binary for all Apple host builds.
FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_mac_universal} URL_HASH SHA1=${DEP_SHA1_protoc_mac_universal})
FetchContent_Populate(protoc_binary)
if(protoc_binary_SOURCE_DIR)
@@ -150,9 +150,57 @@ if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE)
set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc)
set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE})
endif()
+ elseif (CMAKE_CROSSCOMPILING)
+ message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}")
+ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+ if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64")
+ FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64})
+ FetchContent_Populate(protoc_binary)
+ elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86")
+ FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32})
+ FetchContent_Populate(protoc_binary)
+ endif()
+
+ if(protoc_binary_SOURCE_DIR)
+ message("Use prebuilt protoc")
+ set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe)
+ set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE})
+ endif()
+ elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux")
+ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
+ FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64})
+ FetchContent_Populate(protoc_binary)
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
+ FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86})
+ FetchContent_Populate(protoc_binary)
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
+ FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64})
+ FetchContent_Populate(protoc_binary)
+ endif()
+
+ if(protoc_binary_SOURCE_DIR)
+ message("Use prebuilt protoc")
+ set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc)
+ set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE})
+ endif()
+ endif()
+
+ if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE)
+ message(FATAL_ERROR "ONNX_CUSTOM_PROTOC_EXECUTABLE must be set to cross-compile.")
+ endif()
endif()
endif()
+# if ONNX_CUSTOM_PROTOC_EXECUTABLE is set we don't need to build the protoc binary
+if (ONNX_CUSTOM_PROTOC_EXECUTABLE)
+ if (NOT EXISTS "${ONNX_CUSTOM_PROTOC_EXECUTABLE}")
+ message(FATAL_ERROR "ONNX_CUSTOM_PROTOC_EXECUTABLE is set to '${ONNX_CUSTOM_PROTOC_EXECUTABLE}' "
+ "but protoc executable was not found there.")
+ endif()
+
+ set(protobuf_BUILD_PROTOC_BINARIES OFF CACHE BOOL "Build protoc" FORCE)
+endif()
+
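For reference, a minimal sketch of supplying a host protoc by hand instead of relying on the download logic above; the path is illustrative, and only cache variables that already appear in this hunk are used. The validation block above then checks the path and skips building protoc from source.

# Hedged sketch, e.g. in a toolchain file or cache preload script; the path is illustrative.
set(ONNX_CUSTOM_PROTOC_EXECUTABLE "/usr/local/bin/protoc" CACHE FILEPATH "Host protoc used when cross-compiling")
set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE})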
#Here we support two build mode:
#1. if ONNX_CUSTOM_PROTOC_EXECUTABLE is set, build Protobuf from source, except protoc.exe. This mode is mainly
# for cross-compiling
@@ -163,17 +211,6 @@ else()
set(ONNXRUNTIME_PROTOBUF_PATCH_COMMAND "")
endif()
-FetchContent_Declare(
- utf8_range
- URL ${DEP_URL_utf8_range}
- URL_HASH SHA1=${DEP_SHA1_utf8_range}
- FIND_PACKAGE_ARGS NAMES utf8_range
-)
-
-set(utf8_range_ENABLE_TESTS OFF CACHE BOOL "Build test suite" FORCE)
-set(utf8_range_ENABLE_INSTALL OFF CACHE BOOL "Configure installation" FORCE)
-
-
#Protobuf depends on absl and utf8_range
FetchContent_Declare(
Protobuf
@@ -184,17 +221,17 @@ FetchContent_Declare(
)
set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE)
-#TODO: we'd better to turn the following option off. However, it will cause
+#TODO: we'd better turn the following option off. However, it will cause
# ".\build.bat --config Debug --parallel --skip_submodule_sync --update" fail with an error message:
-# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is
+# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is
# not in any export set.
#set(protobuf_INSTALL OFF CACHE BOOL "Install protobuf binaries and files" FORCE)
set(protobuf_USE_EXTERNAL_GTEST ON CACHE BOOL "" FORCE)
-if (CMAKE_SYSTEM_NAME STREQUAL "Android")
- set(protobuf_BUILD_PROTOC_BINARIES OFF CACHE BOOL "Build protobuf tests" FORCE)
- set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build with zlib support" FORCE)
+if (ANDROID)
+ set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build protobuf with zlib support" FORCE)
endif()
+
if (onnxruntime_DISABLE_RTTI)
set(protobuf_DISABLE_RTTI ON CACHE BOOL "Remove runtime type information in the binaries" FORCE)
endif()
@@ -219,8 +256,6 @@ FetchContent_Declare(
URL_HASH SHA1=${DEP_SHA1_mp11}
)
-set(JSON_BuildTests OFF CACHE INTERNAL "")
-set(JSON_Install OFF CACHE INTERNAL "")
set(JSON_BuildTests OFF CACHE INTERNAL "")
set(JSON_Install OFF CACHE INTERNAL "")
@@ -253,14 +288,7 @@ if (onnxruntime_ENABLE_CPUINFO)
set(CPUINFO_SUPPORTED TRUE)
endif()
if (WIN32)
- # Exclude Windows ARM build and Windows Store
- if (${onnxruntime_target_platform} MATCHES "^(ARM.*|arm.*)$" )
- message(WARNING "Cpuinfo not included for compilation problems with Windows ARM.")
- set(CPUINFO_SUPPORTED FALSE)
- elseif (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib)
- message(WARNING "Cpuinfo not included non-Desktop builds")
- set(CPUINFO_SUPPORTED FALSE)
- endif()
+ set(CPUINFO_SUPPORTED TRUE)
elseif (NOT ${onnxruntime_target_platform} MATCHES "^(i[3-6]86|AMD64|x86(_64)?|armv[5-8].*|aarch64|arm64)$")
message(WARNING
"Target processor architecture \"${onnxruntime_target_platform}\" is not supported in cpuinfo. "
@@ -304,13 +332,23 @@ if (CPUINFO_SUPPORTED)
set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE INTERNAL "")
set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE INTERNAL "")
set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
-
- FetchContent_Declare(
- pytorch_cpuinfo
- URL ${DEP_URL_pytorch_cpuinfo}
- URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo}
- FIND_PACKAGE_ARGS NAMES cpuinfo
- )
+ if(onnxruntime_target_platform STREQUAL "ARM64EC")
+ message("Applying a patch for Windows ARM64EC in cpuinfo")
+ FetchContent_Declare(
+ pytorch_cpuinfo
+ URL ${DEP_URL_pytorch_cpuinfo}
+ URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo}
+ PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch
+ FIND_PACKAGE_ARGS NAMES cpuinfo
+ )
+ else()
+ FetchContent_Declare(
+ pytorch_cpuinfo
+ URL ${DEP_URL_pytorch_cpuinfo}
+ URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo}
+ FIND_PACKAGE_ARGS NAMES cpuinfo
+ )
+ endif()
set(ONNXRUNTIME_CPUINFO_PROJ pytorch_cpuinfo)
endif()
@@ -536,22 +574,32 @@ if(onnxruntime_ENABLE_TRAINING OR (onnxruntime_ENABLE_TRAINING_APIS AND onnxrunt
onnxruntime_fetchcontent_makeavailable(cxxopts)
endif()
+if (onnxruntime_USE_COREML)
+ FetchContent_Declare(
+ coremltools
+ URL ${DEP_URL_coremltools}
+ URL_HASH SHA1=${DEP_SHA1_coremltools}
+ PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/coremltools/crossplatformbuild.patch
+ )
+  # We don't build coremltools directly, so use Populate. Selected files are built from onnxruntime_providers_coreml.cmake.
+ FetchContent_Populate(coremltools)
+endif()
+
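FetchContent_Populate only downloads and unpacks the archive and sets coremltools_SOURCE_DIR; it does not add the dependency's own CMakeLists to the build. A hedged sketch of this pattern, with an illustrative target name and file choice (the real selection lives in onnxruntime_providers_coreml.cmake):

# Sketch only: compile selected files straight out of the populated source tree.
FetchContent_Populate(coremltools)
add_library(my_coreml_objects OBJECT
  ${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.cpp)  # illustrative file from the populated tree
target_include_directories(my_coreml_objects PRIVATE ${coremltools_SOURCE_DIR})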
message("Finished fetching external dependencies")
set(onnxruntime_LINK_DIRS )
if (onnxruntime_USE_CUDA)
#TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same
+ find_package(CUDAToolkit REQUIRED)
if (WIN32)
if(onnxruntime_CUDNN_HOME)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib/x64)
endif()
- list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64)
else()
if(onnxruntime_CUDNN_HOME)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64)
endif()
- list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64)
endif()
endif()
@@ -562,4 +610,3 @@ endif()
FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR)
FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR)
-
diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake
index e661aa51bfc17..41f02ce6f22bc 100644
--- a/cmake/external/xnnpack.cmake
+++ b/cmake/external/xnnpack.cmake
@@ -6,10 +6,14 @@ set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
set(PTHREADPOOL_BUILD_TESTS OFF CACHE INTERNAL "")
set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
+ set(XNNPACK_USE_SYSTEM_LIBS OFF)
+endif()
+
# BF16 instructions cause ICE in Android NDK compiler
if(CMAKE_ANDROID_ARCH_ABI STREQUAL armeabi-v7a)
set(XNNPACK_ENABLE_ARM_BF16 OFF)
-ENDIF()
+endif()
# fp16 depends on psimd
FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
diff --git a/cmake/maccatalyst_prepare_objects_for_prelink.py b/cmake/maccatalyst_prepare_objects_for_prelink.py
new file mode 100644
index 0000000000000..34664b4e05237
--- /dev/null
+++ b/cmake/maccatalyst_prepare_objects_for_prelink.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import shutil
+import sys
+
+
+# Note: This script is mainly used to sanity check that the .o files in the .a library match the .o files
+# in the source dir, and to handle the case of source files having duplicate names under different subdirectories
+# of each onnxruntime library. (Only applicable when doing a Mac Catalyst build.)
+def main():
+ source_dir = sys.argv[1]
+ dest_dir = sys.argv[2]
+ files_from_static_lib = sys.argv[3]
+ files_from_source_dir = []
+ for subdir, _, files in os.walk(source_dir):
+ for file_name in files:
+ if file_name.endswith(".o"):
+ files_from_source_dir.append(file_name.strip())
+ dest_name_without_extension, _ = os.path.splitext(file_name)
+ counter = 0
+
+ dest_file = f"{dest_name_without_extension}.o"
+ while os.path.exists(os.path.join(dest_dir, dest_file)):
+ print("Duplicate file name from source: " + os.path.join(source_dir, subdir, file_name))
+ counter += 1
+ dest_file = f"{dest_name_without_extension}_{counter}.o"
+ print("Renamed file name in destination: " + os.path.join(dest_dir, dest_file))
+
+ destination_path = os.path.join(dest_dir, dest_file)
+ source_file = os.path.join(source_dir, subdir, file_name)
+ shutil.copy(source_file, destination_path)
+
+ # Sanity check to ensure the number of .o object from the original cmake source directory matches with the number
+ # of .o files extracted from each .a onnxruntime library
+ file_lists_from_static_lib = []
+ with open(files_from_static_lib) as file:
+ filenames = file.readlines()
+ for filename in filenames:
+ file_lists_from_static_lib.append(filename.strip())
+
+ sorted_list1 = sorted(file_lists_from_static_lib)
+ sorted_list2 = sorted(files_from_source_dir)
+
+ if len(sorted_list1) != len(sorted_list2):
+ print(
+            "Caught a mismatch: number of .o files extracted from the static onnxruntime lib: ",
+ len(sorted_list1),
+            "number of .o object files from the original cmake source directory: ",
+ len(sorted_list2),
+ "for: ",
+ os.path.basename(source_dir),
+ )
+
+ if sorted_list1 == sorted_list2:
+ print(
+            "Sanity check passed: object files from the original source directory match the files extracted "
+            "from the static library for: ",
+ os.path.basename(source_dir),
+ )
+ else:
+ print(
+ "Error: Mismatch between object files from original source directory "
+ "and the .o files extracted from static library for: ",
+ os.path.basename(source_dir),
+ )
+
+
+if __name__ == "__main__":
+ main()
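The script takes three positional arguments: the CMake objects directory for a library, the destination directory for the copied .o files, and a text file listing the .a archive's members. A hedged sketch of how it could be wired in; the real wiring, iterating over onnxruntime_INTERNAL_LIBRARIES, appears in the cmake/onnxruntime.cmake hunk below, and my_static_lib is an illustrative target name.

# Sketch only: dump the archive's member names, then copy/rename and sanity check the objects.
set(obj_dir ${CMAKE_CURRENT_BINARY_DIR}/static_lib_temp/my_static_lib)
add_custom_command(TARGET my_static_lib POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E make_directory ${obj_dir}
  COMMAND ar -t $<TARGET_FILE:my_static_lib> | grep "\.o$" > ${obj_dir}/my_static_lib.object_file_list.txt
  COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/maccatalyst_prepare_objects_for_prelink.py
          ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/my_static_lib.dir
          ${obj_dir}
          ${obj_dir}/my_static_lib.object_file_list.txt
  WORKING_DIRECTORY ${obj_dir})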
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index c900f4d4b09a5..e15c8a046dc20 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -189,7 +189,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${PROVIDERS_SNPE}
${PROVIDERS_TVM}
${PROVIDERS_RKNPU}
- ${PROVIDERS_VITISAI}
${PROVIDERS_XNNPACK}
${PROVIDERS_WEBNN}
${PROVIDERS_AZURE}
@@ -282,7 +281,13 @@ endif()
# Assemble the Apple static framework (iOS and macOS)
if(onnxruntime_BUILD_APPLE_FRAMEWORK)
- set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
+  # When building for Mac Catalyst, CMAKE_OSX_SYSROOT is set to MacOSX as well. To avoid a duplicated directory name,
+  # we use `-macabi` in the name of the output static Apple framework directory instead.
+ if (PLATFORM_NAME STREQUAL "macabi")
+ set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-macabi)
+ else()
+ set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
+ endif()
# Setup the various directories required. Remove any existing ones so we start with a clean directory.
set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries)
@@ -300,18 +305,34 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
# to enforce symbol visibility. doing it this way limits the symbols included from the .a files to symbols used
# by the ORT .o files.
- # If it's an onnxruntime library, extract .o files to a separate directory for each library to avoid any clashes
- # with filenames (e.g. utils.o)
+ # If it's an onnxruntime library, extract .o files from the original cmake build path to a separate directory for
+ # each library to avoid any clashes with filenames (e.g. utils.o)
foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} )
GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE)
if(_LIB_TYPE STREQUAL "STATIC_LIBRARY")
      set(CUR_STATIC_LIB_OBJ_DIR ${STATIC_LIB_TEMP_DIR}/$<TARGET_LINKER_FILE_BASE_NAME:${_LIB}>)
add_custom_command(TARGET onnxruntime POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CUR_STATIC_LIB_OBJ_DIR})
-
- add_custom_command(TARGET onnxruntime POST_BUILD
-      COMMAND ar ARGS -x $<TARGET_FILE:${_LIB}>
- WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR})
+ if (PLATFORM_NAME STREQUAL "macabi")
+        # Several source files have duplicate names under different subdirectories within
+        # each onnxruntime library. (e.g. onnxruntime/contrib_ops/cpu/element_wise_ops.o
+        # vs. onnxruntime/providers/core/cpu/math/element_wise_ops.o)
+        # In that case, using 'ar ARGS -x' to extract the .o files from the .a lib could overwrite files with
+        # duplicate names and lead to undefined symbol errors in the generated binary.
+        # So we use the python script below as a sanity check: it recursively finds all .o files in
+        # ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR}, verifies that they match the contents of the .a, and then copies them from the source dir.
+        # TODO: The copying action here isn't really necessary. As a future fix, consider having the script extract from
+        # the ar archive and apply the renaming, so that both Mac Catalyst and other builds do the same thing.
+ set(CUR_TARGET_CMAKE_SOURCE_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_LIB}.dir)
+ add_custom_command(TARGET onnxruntime POST_BUILD
+          COMMAND ar -t $<TARGET_FILE:${_LIB}> | grep "\.o$" > ${_LIB}.object_file_list.txt
+ COMMAND ${CMAKE_COMMAND} -E env python3 ${CMAKE_CURRENT_SOURCE_DIR}/maccatalyst_prepare_objects_for_prelink.py ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR} ${CUR_STATIC_LIB_OBJ_DIR} ${CUR_STATIC_LIB_OBJ_DIR}/${_LIB}.object_file_list.txt
+ WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR})
+ else()
+ add_custom_command(TARGET onnxruntime POST_BUILD
+          COMMAND ar ARGS -x $<TARGET_FILE:${_LIB}>
+ WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR})
+ endif()
endif()
endforeach()
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index 43d5fa9bdee34..896379d743441 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -71,6 +71,12 @@ if(onnxruntime_target_platform STREQUAL "ARM64EC")
endif()
endif()
+if(onnxruntime_target_platform STREQUAL "ARM64")
+ if (MSVC)
+ add_compile_options("/bigobj")
+ endif()
+endif()
+
file(GLOB onnxruntime_common_src CONFIGURE_DEPENDS
${onnxruntime_common_src_patterns}
)
@@ -129,7 +135,7 @@ target_include_directories(onnxruntime_common
${OPTIONAL_LITE_INCLUDE_DIR})
-target_link_libraries(onnxruntime_common PUBLIC safeint_interface ${GSL_TARGET} ${ABSEIL_LIBS})
+target_link_libraries(onnxruntime_common PUBLIC safeint_interface ${GSL_TARGET} ${ABSEIL_LIBS} date::date)
add_dependencies(onnxruntime_common ${onnxruntime_EXTERNAL_DEPENDENCIES})
@@ -189,6 +195,8 @@ elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
set(ARM TRUE)
elseif(dumpmachine_output MATCHES "^aarch64.*")
set(ARM64 TRUE)
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
+ set(RISCV64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
set(X86 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
@@ -198,11 +206,7 @@ elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
endif()
-if (ARM64 OR ARM OR X86 OR X64 OR X86_64)
- if((WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC))
- # msvc compiler report syntax error with cpuinfo arm source files
- # and cpuinfo does not have code for getting arm uarch info under windows
- else()
+if (RISCV64 OR ARM64 OR ARM OR X86 OR X64 OR X86_64)
# Link cpuinfo if supported
# Using it mainly in ARM with Android.
# Its functionality in detecting x86 cpu features are lacking, so is support for Windows.
@@ -210,7 +214,6 @@ if (ARM64 OR ARM OR X86 OR X64 OR X86_64)
onnxruntime_add_include_to_target(onnxruntime_common cpuinfo::cpuinfo)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES cpuinfo::cpuinfo ${ONNXRUNTIME_CLOG_TARGET_NAME})
endif()
- endif()
endif()
if (NOT onnxruntime_BUILD_SHARED_LIB)
diff --git a/cmake/onnxruntime_compile_triton_kernel.cmake b/cmake/onnxruntime_compile_triton_kernel.cmake
index f59cc6de108bc..9ecb8cf93265c 100644
--- a/cmake/onnxruntime_compile_triton_kernel.cmake
+++ b/cmake/onnxruntime_compile_triton_kernel.cmake
@@ -4,10 +4,12 @@
find_package(Python3 COMPONENTS Interpreter REQUIRED)
# set all triton kernel ops that need to be compiled
-set(triton_kernel_scripts
- "onnxruntime/core/providers/rocm/math/softmax_triton.py"
- "onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py"
-)
+if(onnxruntime_USE_ROCM)
+ set(triton_kernel_scripts
+ "onnxruntime/core/providers/rocm/math/softmax_triton.py"
+ "onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py"
+ )
+endif()
function(compile_triton_kernel out_triton_kernel_obj_file out_triton_kernel_header_dir)
# compile triton kernel, generate .a and .h files
diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake
index 3f532ec2c3261..4d51325b8414e 100644
--- a/cmake/onnxruntime_graph.cmake
+++ b/cmake/onnxruntime_graph.cmake
@@ -7,8 +7,26 @@ file(GLOB_RECURSE onnxruntime_graph_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/graph/*.cc"
)
-# create empty list for any excludes
+# start with empty training srcs list
+set(orttraining_graph_src)
+
+if (onnxruntime_ENABLE_TRAINING_OPS AND NOT onnxruntime_ENABLE_TRAINING)
+ set(orttraining_graph_src
+ "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.cc"
+ "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.h"
+ )
+endif()
+
+if (onnxruntime_ENABLE_TRAINING)
+ file(GLOB_RECURSE orttraining_graph_src CONFIGURE_DEPENDS
+ "${ORTTRAINING_SOURCE_DIR}/core/graph/*.h"
+ "${ORTTRAINING_SOURCE_DIR}/core/graph/*.cc"
+ )
+endif()
+
+# create empty lists for any excludes
set(onnxruntime_graph_src_exclude_patterns)
+set(orttraining_graph_src_exclude_patterns)
if (onnxruntime_MINIMAL_BUILD)
# remove schema registration support
@@ -22,11 +40,18 @@ if (onnxruntime_MINIMAL_BUILD)
"${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.cc"
"${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.h"
"${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.cc"
+ "${ONNXRUNTIME_ROOT}/core/graph/dml_ops/dml_defs.h"
+ "${ONNXRUNTIME_ROOT}/core/graph/dml_ops/dml_defs.cc"
"${ONNXRUNTIME_ROOT}/core/graph/function_template.h"
"${ONNXRUNTIME_ROOT}/core/graph/function_utils.h"
"${ONNXRUNTIME_ROOT}/core/graph/function_utils.cc"
)
+ list(APPEND orttraining_graph_src_exclude_patterns
+ "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.h"
+ "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.cc"
+ )
+
# no Function support initially
list(APPEND onnxruntime_graph_src_exclude_patterns
"${ONNXRUNTIME_ROOT}/core/graph/function*"
@@ -64,30 +89,12 @@ endif()
file(GLOB onnxruntime_graph_src_exclude ${onnxruntime_graph_src_exclude_patterns})
list(REMOVE_ITEM onnxruntime_graph_src ${onnxruntime_graph_src_exclude})
-file(GLOB_RECURSE onnxruntime_ir_defs_src CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/defs/*.cc"
-)
-
-if (onnxruntime_ENABLE_TRAINING_OPS AND NOT onnxruntime_ENABLE_TRAINING)
- set(orttraining_graph_src
- "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.cc"
- "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.h"
- )
-endif()
-
-if (onnxruntime_ENABLE_TRAINING)
- file(GLOB_RECURSE orttraining_graph_src CONFIGURE_DEPENDS
- "${ORTTRAINING_SOURCE_DIR}/core/graph/*.h"
- "${ORTTRAINING_SOURCE_DIR}/core/graph/*.cc"
- )
-endif()
-
-set(onnxruntime_graph_lib_src ${onnxruntime_graph_src} ${onnxruntime_ir_defs_src})
if (onnxruntime_ENABLE_TRAINING_OPS)
- list(APPEND onnxruntime_graph_lib_src ${orttraining_graph_src})
+ file(GLOB orttraining_graph_src_exclude ${orttraining_graph_src_exclude_patterns})
+ list(REMOVE_ITEM orttraining_graph_src ${orttraining_graph_src_exclude})
endif()
-onnxruntime_add_static_library(onnxruntime_graph ${onnxruntime_graph_lib_src})
+onnxruntime_add_static_library(onnxruntime_graph ${onnxruntime_graph_src} ${orttraining_graph_src})
add_dependencies(onnxruntime_graph onnx_proto flatbuffers::flatbuffers)
onnxruntime_add_include_to_target(onnxruntime_graph onnxruntime_common ${WIL_TARGET} onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers safeint_interface Boost::mp11)
@@ -120,7 +127,7 @@ endif()
set_target_properties(onnxruntime_graph PROPERTIES FOLDER "ONNXRuntime")
set_target_properties(onnxruntime_graph PROPERTIES LINKER_LANGUAGE CXX)
-source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_graph_src} ${onnxruntime_ir_defs_src})
+source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_graph_src})
if (onnxruntime_ENABLE_TRAINING_OPS)
source_group(TREE ${ORTTRAINING_ROOT} FILES ${orttraining_graph_src})
endif()
diff --git a/cmake/onnxruntime_ios.toolchain.cmake b/cmake/onnxruntime_ios.toolchain.cmake
index 750e4118ca1fc..f2106f2423bf2 100644
--- a/cmake/onnxruntime_ios.toolchain.cmake
+++ b/cmake/onnxruntime_ios.toolchain.cmake
@@ -2,6 +2,7 @@
# Licensed under the MIT License.
set(CMAKE_SYSTEM_NAME iOS)
+
if (NOT DEFINED CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM AND NOT DEFINED CMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY)
set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED NO)
endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 26e4380af4c23..304aa77f5473c 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -1,7 +1,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
-set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib)
+set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
+set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
+set(MLAS_INC_DIR ${MLAS_ROOT}/inc)
#
# All hardware agnostic source files here
@@ -9,6 +11,7 @@ set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib)
# multi-target build
#
onnxruntime_add_static_library(onnxruntime_mlas
+ ${MLAS_SRC_DIR}/mlasi.h
${MLAS_SRC_DIR}/platform.cpp
${MLAS_SRC_DIR}/threading.cpp
${MLAS_SRC_DIR}/sgemm.cpp
@@ -33,7 +36,17 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/qpostprocessor.cpp
${MLAS_SRC_DIR}/qlgavgpool.cpp
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
+ ${MLAS_SRC_DIR}/sqnbitgemm.h
${MLAS_SRC_DIR}/sqnbitgemm.cpp
+ ${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
+)
+
+target_sources(onnxruntime_mlas PRIVATE
+ ${MLAS_INC_DIR}/mlas_float16.h
+ ${MLAS_INC_DIR}/mlas_gemm_postprocessor.h
+ ${MLAS_INC_DIR}/mlas_q4.h
+ ${MLAS_INC_DIR}/mlas_qnbit.h
+ ${MLAS_INC_DIR}/mlas.h
)
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
@@ -134,10 +147,6 @@ function(setup_mlas_source_for_windows)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/arm/sgemmc.cpp
)
- # it should be removed after Visual Stuio is upgraded to 17.7
- if (MSVC)
- add_compile_options("-d2SSAOptimizer-")
- endif()
elseif(onnxruntime_target_platform STREQUAL "x64")
file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS
@@ -159,6 +168,9 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
+ ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
+ ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
+ ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAmx.asm
${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm
${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm
@@ -189,6 +201,7 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/amd64/sgemma.asm
${MLAS_SRC_DIR}/amd64/cvtfp16a.asm
${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx.asm
+ ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx512F.asm
${MLAS_SRC_DIR}/amd64/TransKernelFma3.asm
${MLAS_SRC_DIR}/amd64/TransKernelAvx512F.asm
${MLAS_SRC_DIR}/amd64/LogisticKernelFma3.asm
@@ -200,7 +213,6 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/q4gemm_avx512.cpp
)
endif()
-
else()
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
@@ -292,8 +304,8 @@ else()
if(APPLE)
get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
endif()
- list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH)
- if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH GREATER 1)
+ list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
+ if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
endif()
#If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below
@@ -340,25 +352,31 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
)
+ set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
if (NOT APPLE)
set(mlas_platform_srcs
${mlas_platform_srcs}
${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
+ ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
${MLAS_SRC_DIR}/activate_fp16.cpp
${MLAS_SRC_DIR}/dwconv.cpp
${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/pooling_fp16.cpp
${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
+ ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
+ set_source_files_properties(${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
endif()
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
@@ -516,6 +534,7 @@ else()
${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
+ ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
@@ -523,6 +542,7 @@ else()
${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/SconvKernelAvx512F.S
+ ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
@@ -534,9 +554,15 @@ else()
${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S
${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S
${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
+ ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
+ set(mlas_platform_srcs_avx512vnni
+ ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
+ )
+ set_source_files_properties(${mlas_platform_srcs_avx512vnni} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
+
set(mlas_platform_srcs
${MLAS_SRC_DIR}/activate_fp16.cpp
${MLAS_SRC_DIR}/dwconv.cpp
@@ -548,6 +574,7 @@ else()
${mlas_platform_srcs_avx2}
${mlas_platform_srcs_avx512f}
${mlas_platform_srcs_avx512core}
+ ${mlas_platform_srcs_avx512vnni}
)
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
@@ -566,7 +593,7 @@ else()
)
set_source_files_properties(${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
set_source_files_properties(${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
- endif()
+ endif()
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs})
@@ -605,10 +632,12 @@ else()
endif()
foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
- target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
+ target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
+
+ set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
endforeach()
-set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")
+
if (WIN32)
  target_compile_options(onnxruntime_mlas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>" "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
if (onnxruntime_ENABLE_STATIC_ANALYSIS)
@@ -616,6 +645,12 @@ if (WIN32)
endif()
endif()
+if (PLATFORM_NAME STREQUAL "macabi")
+ # Needed for maccatalyst C compilation
+ # i.e. the flags below add "--target=x86_64-apple-ios14.0-macabi -ffunction-sections -fdata-sections"
+ target_compile_options(onnxruntime_mlas PRIVATE ${CMAKE_C_FLAGS})
+endif()
+
if (NOT onnxruntime_BUILD_SHARED_LIB)
install(TARGETS onnxruntime_mlas
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -624,6 +659,21 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+# set up source group for MLAS source files
+block()
+ set(source_group_srcs)
+ foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
+ get_target_property(mlas_target_srcs ${mlas_target} SOURCES)
+ foreach(mlas_target_src ${mlas_target_srcs})
+ cmake_path(IS_PREFIX MLAS_ROOT ${mlas_target_src} in_mlas_root)
+ if(in_mlas_root)
+ list(APPEND source_group_srcs ${mlas_target_src})
+ endif()
+ endforeach()
+ endforeach()
+ source_group(TREE ${MLAS_ROOT} FILES ${source_group_srcs})
+endblock()
+
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
@@ -635,7 +685,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD)
onnxruntime_add_executable(onnxruntime_mlas_q4dq
${MLAS_SRC_DIR}/q4_dq_cli.cpp
)
- target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
+ target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
set_target_properties(onnxruntime_mlas_q4dq PROPERTIES FOLDER "ONNXRuntimeTest")
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake
index 6053b9d1088cd..f11928c11cf14 100644
--- a/cmake/onnxruntime_nodejs.cmake
+++ b/cmake/onnxruntime_nodejs.cmake
@@ -73,6 +73,9 @@ endif()
if (onnxruntime_USE_COREML)
set(NODEJS_BINDING_USE_COREML "--use_coreml")
endif()
+if (onnxruntime_USE_QNN)
+ set(NODEJS_BINDING_USE_QNN "--use_qnn")
+endif()
if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS)
# add custom target
@@ -88,9 +91,9 @@ add_custom_target(js_common_npm_ci ALL
add_custom_target(nodejs_binding_wrapper ALL
COMMAND ${NPM_CLI} ci
- COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE}
+ COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_TENSORRT}
- ${NODEJS_BINDING_USE_COREML}
+ ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
WORKING_DIRECTORY ${JS_NODE_ROOT}
COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")
diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake
index 6f09583199ffd..3bae1b8a48e0f 100644
--- a/cmake/onnxruntime_optimizer.cmake
+++ b/cmake/onnxruntime_optimizer.cmake
@@ -19,6 +19,8 @@ if (onnxruntime_MINIMAL_BUILD)
"${ONNXRUNTIME_ROOT}/core/optimizer/graph_transformer_utils.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/initializer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/initializer.h"
+ "${ONNXRUNTIME_ROOT}/core/optimizer/matmul_nbits_fusion.cc"
+ "${ONNXRUNTIME_ROOT}/core/optimizer/matmul_nbits_fusion.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/nhwc_transformer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/nhwc_transformer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_final_cleanup.cc"
@@ -111,6 +113,7 @@ onnxruntime_add_include_to_target(onnxruntime_optimizer onnxruntime_common onnxr
target_include_directories(onnxruntime_optimizer PRIVATE ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_optimizer PRIVATE ${ORTTRAINING_ROOT})
+ onnxruntime_add_include_to_target(onnxruntime_optimizer nlohmann_json::nlohmann_json)
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
onnxruntime_add_include_to_target(onnxruntime_optimizer Python::Module)
endif()
@@ -130,3 +133,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+
+if (onnxruntime_USE_ROCM)
+ add_dependencies(onnxruntime_optimizer generate_hipified_files)
+endif()
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 8d3ea403fb74b..7e7819ac31a19 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -66,11 +66,7 @@ if(onnxruntime_USE_CUDA)
set(PROVIDERS_CUDA onnxruntime_providers_cuda)
endif()
if(onnxruntime_USE_COREML)
- if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS")
- set(PROVIDERS_COREML onnxruntime_providers_coreml onnxruntime_coreml_proto)
- else()
- set(PROVIDERS_COREML onnxruntime_providers_coreml)
- endif()
+ set(PROVIDERS_COREML onnxruntime_providers_coreml coreml_proto)
endif()
if(onnxruntime_USE_NNAPI_BUILTIN)
set(PROVIDERS_NNAPI onnxruntime_providers_nnapi)
diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake
index aa8c35526b274..0aa25a221bf27 100644
--- a/cmake/onnxruntime_providers_coreml.cmake
+++ b/cmake/onnxruntime_providers_coreml.cmake
@@ -1,107 +1,222 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
- if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD)
- message(FATAL_ERROR "CoreML EP can not be used in a basic minimal build. Please build with '--minimal_build extended'")
- endif()
+if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD)
+ message(FATAL_ERROR "CoreML EP can not be used in a basic minimal build. Please build with '--minimal_build extended'")
+endif()
+
+add_compile_definitions(USE_COREML=1)
- add_compile_definitions(USE_COREML=1)
-
- # Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml
- if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS")
- set(COREML_PROTO_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime/core/providers/coreml/mlmodel_format)
- file(GLOB coreml_proto_srcs
- "${COREML_PROTO_ROOT}/*.proto"
- )
- onnxruntime_add_static_library(onnxruntime_coreml_proto ${coreml_proto_srcs})
-    target_include_directories(onnxruntime_coreml_proto PUBLIC $<TARGET_PROPERTY:${PROTOBUF_LIB},INTERFACE_INCLUDE_DIRECTORIES> "${CMAKE_CURRENT_BINARY_DIR}")
-    target_compile_definitions(onnxruntime_coreml_proto PUBLIC $<TARGET_PROPERTY:${PROTOBUF_LIB},INTERFACE_COMPILE_DEFINITIONS>)
- set_target_properties(onnxruntime_coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
- set_target_properties(onnxruntime_coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility-inlines-hidden")
- set(_src_sub_dir "coreml/")
- onnxruntime_protobuf_generate(
- APPEND_PATH
- GEN_SRC_SUB_DIR ${_src_sub_dir}
- IMPORT_DIRS ${COREML_PROTO_ROOT}
- TARGET onnxruntime_coreml_proto
- )
-
- if (NOT onnxruntime_BUILD_SHARED_LIB)
- install(TARGETS onnxruntime_coreml_proto
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
- FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}
- )
- endif()
+# Check if we can build the coremltools code for creating an mlpackage with an mlprogram.
+# The coremltools source requires std::filesystem::path which is only available from iOS 13 on.
+set(_enable_ML_PROGRAM ON)
+if (IOS AND CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 13.0)
+ message(WARNING "CoreML ML Program is not supported on iOS < 13.0. Excluding ML Program support from build.")
+ set(_enable_ML_PROGRAM OFF)
+elseif(LINUX)
+ # uuid-dev is required. we don't bother installing on CIs as it's really for manual developer testing.
+ find_library(LibUUID_LIBRARY NAMES uuid)
+ find_path(LibUUID_INCLUDE_DIR NAMES uuid/uuid.h)
+ if (NOT LibUUID_INCLUDE_DIR)
+ message(STATUS "uuid/uuid.h was not found as is required for ML Program support. "
+ "Run `sudo apt install uuid-dev` if you need to test ML Program related CoreML EP code. ")
+ set(_enable_ML_PROGRAM OFF)
endif()
+endif()
+
+if (_enable_ML_PROGRAM)
+ add_compile_definitions(COREML_ENABLE_MLPROGRAM=1)
+endif()
+
+# Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml_proto
+set(COREML_PROTO_ROOT ${coremltools_SOURCE_DIR}/mlmodel/format)
+file(GLOB coreml_proto_srcs "${COREML_PROTO_ROOT}/*.proto")
+
+onnxruntime_add_static_library(coreml_proto ${coreml_proto_srcs})
+target_include_directories(coreml_proto
+  PUBLIC $<TARGET_PROPERTY:${PROTOBUF_LIB},INTERFACE_INCLUDE_DIRECTORIES>
+ "${CMAKE_CURRENT_BINARY_DIR}")
+target_compile_definitions(coreml_proto
+  PUBLIC $<TARGET_PROPERTY:${PROTOBUF_LIB},INTERFACE_COMPILE_DEFINITIONS>)
+set_target_properties(coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
+set_target_properties(coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility-inlines-hidden")
- # These are shared utils,
- # TODO, move this to a separated lib when used by EPs other than NNAPI and CoreML
- file(GLOB_RECURSE onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
- "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
+set(_src_sub_dir "coreml_proto/")
+onnxruntime_protobuf_generate(
+ APPEND_PATH
+ GEN_SRC_SUB_DIR ${_src_sub_dir}
+ IMPORT_DIRS ${COREML_PROTO_ROOT}
+ TARGET coreml_proto
+)
+
+if (NOT onnxruntime_BUILD_SHARED_LIB)
+ install(TARGETS coreml_proto
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}
)
+endif()
+
+# Add the .proto and generated .cc/.h files to the External/coreml_proto folder in Visual Studio.
+# Separate source_group for each as the .proto files are in the repo and the .cc/.h files are generated in the build
+# output directory.
+set_target_properties(coreml_proto PROPERTIES FOLDER "External")
+source_group(TREE ${COREML_PROTO_ROOT} PREFIX coreml_proto FILES ${coreml_proto_srcs})
+
+# filter to the generated .cc/.h files
+get_target_property(coreml_proto_generated_srcs coreml_proto SOURCES)
+list(FILTER coreml_proto_generated_srcs INCLUDE REGEX "\.pb\.(h|cc)$")
+source_group(TREE ${CMAKE_CURRENT_BINARY_DIR} PREFIX coreml_proto_generated FILES ${coreml_proto_generated_srcs})
+
+# These are shared utils,
+# TODO, move this to a separate lib when used by EPs other than NNAPI and CoreML
+file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
+)
+file(GLOB onnxruntime_providers_coreml_public_headers CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_INCLUDE_DIR}/core/providers/coreml/*.h"
+)
+
+file(GLOB
+ onnxruntime_providers_coreml_cc_srcs_top CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.cc"
+)
+
+# Add builder source code
+file(GLOB_RECURSE
+ onnxruntime_providers_coreml_cc_srcs_nested CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc"
+)
+
+if(_enable_ML_PROGRAM)
+  # Add helpers to create mlpackage weights. Limit to just the files we need, to minimize the changes required to make them
+ # build on Windows and Linux.
file(GLOB
- onnxruntime_providers_coreml_cc_srcs_top CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.cc"
+ onnxruntime_providers_coreml_milblob_cc_srcs CONFIGURE_DEPENDS
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.hpp"
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.cpp"
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Util/*.hpp"
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/BlobDataType.hpp"
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageFormat.hpp"
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp"
+ "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageWriter.?pp"
)
- # Add builder source code
- file(GLOB_RECURSE
- onnxruntime_providers_coreml_cc_srcs_nested CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc"
+ # Add helpers to create mlpackage
+ file(GLOB
+ onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS
+ "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp"
+ "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp"
)
- if (NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND NOT CMAKE_SYSTEM_NAME STREQUAL "iOS")
- list(REMOVE_ITEM onnxruntime_providers_coreml_cc_srcs_nested
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/model_builder.h"
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/model_builder.cc"
- )
- endif()
-
- # Add CoreML objective c++ source code
- if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS")
- file(GLOB
- onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h"
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.mm"
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h"
- "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.mm"
- )
- endif()
- set(onnxruntime_providers_coreml_cc_srcs
- ${onnxruntime_providers_coreml_cc_srcs_top}
- ${onnxruntime_providers_coreml_cc_srcs_nested}
- ${onnxruntime_providers_shared_utils_cc_srcs}
+ set(coremltools_srcs
+ ${onnxruntime_providers_coreml_milblob_cc_srcs}
+ ${onnxruntime_providers_coreml_modelpackage_cc_srcs}
)
- source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_coreml_cc_srcs})
- onnxruntime_add_static_library(onnxruntime_providers_coreml
- ${onnxruntime_providers_coreml_cc_srcs} ${onnxruntime_providers_coreml_objcc_srcs}
+ source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs})
+endif()
+
+# Add CoreML objective c++ source code
+if (APPLE)
+ file(GLOB
+ onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.mm"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.mm"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/objc_str_utils.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/objc_str_utils.mm"
)
- onnxruntime_add_include_to_target(onnxruntime_providers_coreml
- onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface
+else()
+ # add the Model implementation that uses the protobuf types but excludes any actual CoreML dependencies
+ # by using stub implementations on non-Apple platforms.
+ file(GLOB
+ onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils_stub.cc"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model_stub.cc"
)
- if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS")
- onnxruntime_add_include_to_target(onnxruntime_providers_coreml onnxruntime_coreml_proto)
- target_link_libraries(onnxruntime_providers_coreml PRIVATE onnxruntime_coreml_proto "-framework Foundation" "-framework CoreML")
- add_dependencies(onnxruntime_providers_coreml onnxruntime_coreml_proto)
+endif()
+
+set(onnxruntime_providers_coreml_cc_srcs
+ ${onnxruntime_providers_coreml_cc_srcs_top}
+ ${onnxruntime_providers_coreml_cc_srcs_nested}
+ ${onnxruntime_providers_shared_utils_cc_srcs}
+ ${onnxruntime_providers_coreml_objcc_srcs}
+)
+
+source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_providers_coreml_cc_srcs})
+source_group(TREE ${ONNXRUNTIME_INCLUDE_DIR} FILES ${onnxruntime_providers_coreml_public_headers})
+
+onnxruntime_add_static_library(onnxruntime_providers_coreml
+ ${onnxruntime_providers_coreml_public_headers}
+ ${onnxruntime_providers_coreml_cc_srcs}
+ ${coremltools_srcs}
+)
+
+onnxruntime_add_include_to_target(onnxruntime_providers_coreml
+ onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11
+ safeint_interface
+)
+
+onnxruntime_add_include_to_target(onnxruntime_providers_coreml coreml_proto)
+target_link_libraries(onnxruntime_providers_coreml PRIVATE coreml_proto)
+add_dependencies(onnxruntime_providers_coreml coreml_proto)
+
+if (APPLE)
+ target_compile_definitions(onnxruntime_providers_coreml PRIVATE __APPLE__)
+endif()
+
+if (_enable_ML_PROGRAM)
+ # Setup coremltools fp16 and json dependencies for creating an mlpackage.
+ #
+ # These are also used by external/xnnpack.cmake. fp16 depends on psimd
+ FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
+ onnxruntime_fetchcontent_makeavailable(psimd)
+ set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR})
+ FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16})
+ set(FP16_BUILD_TESTS OFF CACHE INTERNAL "")
+ set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
+ onnxruntime_fetchcontent_makeavailable(fp16)
+
+ # need to tweak the include paths to match what the coreml source code expects
+ target_include_directories(onnxruntime_providers_coreml PRIVATE
+ ${fp16_SOURCE_DIR}/include
+ ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann
+ ${coremltools_SOURCE_DIR}
+ ${coremltools_SOURCE_DIR}/mlmodel/src/
+ ${coremltools_SOURCE_DIR}/modelpackage/src/
+ )
+
+ add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16)
+
+ if (LINUX)
+ target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid)
endif()
- add_dependencies(onnxruntime_providers_coreml ${onnxruntime_EXTERNAL_DEPENDENCIES})
-
- set_target_properties(onnxruntime_providers_coreml PROPERTIES CXX_STANDARD_REQUIRED ON)
- set_target_properties(onnxruntime_providers_coreml PROPERTIES FOLDER "ONNXRuntime")
- target_include_directories(onnxruntime_providers_coreml PRIVATE ${ONNXRUNTIME_ROOT} ${coreml_INCLUDE_DIRS})
- set_target_properties(onnxruntime_providers_coreml PROPERTIES LINKER_LANGUAGE CXX)
-
- if (NOT onnxruntime_BUILD_SHARED_LIB)
- install(TARGETS onnxruntime_providers_coreml
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
- FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
- endif()
\ No newline at end of file
+endif()
+
+if (APPLE)
+ target_link_libraries(onnxruntime_providers_coreml PRIVATE "-framework Foundation" "-framework CoreML")
+endif()
+
+add_dependencies(onnxruntime_providers_coreml ${onnxruntime_EXTERNAL_DEPENDENCIES})
+
+set_target_properties(onnxruntime_providers_coreml PROPERTIES CXX_STANDARD_REQUIRED ON)
+set_target_properties(onnxruntime_providers_coreml PROPERTIES FOLDER "ONNXRuntime")
+target_include_directories(onnxruntime_providers_coreml PRIVATE ${ONNXRUNTIME_ROOT} ${coreml_INCLUDE_DIRS})
+set_target_properties(onnxruntime_providers_coreml PROPERTIES LINKER_LANGUAGE CXX)
+
+if (NOT onnxruntime_BUILD_SHARED_LIB)
+ install(TARGETS onnxruntime_providers_coreml
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index f60faa4d39116..b211c02f712bd 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -60,6 +60,15 @@ if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
"${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op_executor.cc"
)
endif()
+ set(onnxruntime_cpu_neural_speed_srcs
+ "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_wrapper.h"
+ "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_defs.h"
+ "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.cc"
+ "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.h"
+ )
+ if(NOT USE_NEURAL_SPEED)
+ list(REMOVE_ITEM onnxruntime_cpu_contrib_ops_srcs ${onnxruntime_cpu_neural_speed_srcs})
+ endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs})
list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_contrib_ops_srcs})
@@ -144,6 +153,12 @@ if (HAS_BITWISE_INSTEAD_OF_LOGICAL)
target_compile_options(onnxruntime_providers PRIVATE "-Wno-bitwise-instead-of-logical")
endif()
+if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
+ if(USE_NEURAL_SPEED)
+ onnxruntime_add_include_to_target(onnxruntime_providers neural_speed::bestla)
+ endif()
+endif()
+
if (MSVC)
target_compile_options(onnxruntime_providers PRIVATE "/bigobj")
# if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
@@ -208,7 +223,7 @@ set_target_properties(onnxruntime_providers PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(onnxruntime_providers PROPERTIES FOLDER "ONNXRuntime")
if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
- AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS"
+ AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS"
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android"
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
file(GLOB onnxruntime_providers_shared_cc_srcs CONFIGURE_DEPENDS
@@ -258,4 +273,4 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
-endif()
\ No newline at end of file
+endif()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 84d1376f99d5e..46bc5fb3bd1ac 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -1,10 +1,25 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
- file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
- )
+
+ if (onnxruntime_CUDA_MINIMAL)
+ file(GLOB onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.cc"
+ )
+    # Exclude sources that are not used in the minimal CUDA EP build
+ list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc"
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h"
+ )
+ else()
+ file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
+ )
+ endif()
# Remove pch files
list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h"
@@ -16,11 +31,16 @@
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
- file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu"
- "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh"
- )
+
+ if (onnxruntime_CUDA_MINIMAL)
+ set(onnxruntime_providers_cuda_shared_srcs "")
+ else()
+ file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu"
+ "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh"
+ )
+ endif()
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
@@ -102,7 +122,7 @@
endif()
if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
# cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and
- # add to the lib onnxruntime_providers_cuda separatedly.
+ # added to the lib onnxruntime_providers_cuda separately.
# onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc.
set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc)
list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src})
@@ -121,18 +141,22 @@
if (HAS_GUARD_CF)
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /guard:cf>")
endif()
+
if (HAS_QSPECTRE)
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /Qspectre>")
endif()
+
foreach(ORT_FLAG ${ORT_WARNING_FLAGS})
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler \"${ORT_FLAG}\">")
endforeach()
+
# CUDA 11.3+ supports parallel compilation
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.3)
- option(onnxruntime_NVCC_THREADS "Number of threads that NVCC can use for compilation." 1)
+ set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.")
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
endif()
+
if (UNIX)
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler -Wno-reorder>"
                  "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-reorder>")
@@ -142,6 +166,13 @@
#mutex.cuh(91): warning C4834: discarding return value of function with 'nodiscard' attribute
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4834>")
      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4127>")
+ if (MSVC)
+ # the VS warnings for 'Conditional Expression is Constant' are spurious as they don't handle multiple conditions
+        # e.g. `if (std::is_same_v<T, U> && not_a_const)` will generate the warning even though constexpr cannot
+ # be used due to `&& not_a_const`. This affects too many places for it to be reasonable to disable at a finer
+ # granularity.
+        target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
+ endif()
endif()
onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers)
@@ -156,10 +187,16 @@
endif()
add_dependencies(${target} onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
- target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface)
- if(onnxruntime_CUDNN_HOME)
- target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include)
- target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib)
+ if(onnxruntime_CUDA_MINIMAL)
+ target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL)
+ target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface CUDA::cudart)
+ else()
+ target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn CUDA::curand CUDA::cufft CUDA::cudart
+ ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface)
+ if(onnxruntime_CUDNN_HOME)
+ target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include)
+ target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib)
+ endif()
endif()
if (onnxruntime_USE_TRITON_KERNEL)
@@ -171,25 +208,24 @@
target_include_directories(${target} PRIVATE ${triton_kernel_header_dir})
target_link_libraries(${target} PUBLIC -Wl,--whole-archive ${triton_kernel_obj_file} -Wl,--no-whole-archive)
# lib cuda needed by cuLaunchKernel
- target_link_libraries(${target} PRIVATE cuda)
+ target_link_libraries(${target} PRIVATE CUDA::cuda_driver)
endif()
include(cutlass)
- target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples)
+ target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include)
- target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+ target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES}
+ PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")
if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling
- target_include_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
- target_link_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
- target_link_libraries(${target} PRIVATE cupti)
+ target_link_libraries(${target} PRIVATE CUDA::cupti)
endif()
- if (onnxruntime_ENABLE_NVTX_PROFILE AND NOT WIN32)
- target_link_libraries(${target} PRIVATE nvToolsExt)
+ if (onnxruntime_ENABLE_NVTX_PROFILE)
+ target_link_libraries(${target} PRIVATE CUDA::nvtx3)
endif()
if (onnxruntime_ENABLE_TRAINING_OPS)
diff --git a/cmake/onnxruntime_providers_dml.cmake b/cmake/onnxruntime_providers_dml.cmake
index 01b0bda9fea6b..439be882dcc5e 100644
--- a/cmake/onnxruntime_providers_dml.cmake
+++ b/cmake/onnxruntime_providers_dml.cmake
@@ -62,7 +62,7 @@
target_link_libraries(onnxruntime_providers_dml PRIVATE delayimp.lib)
if (NOT GDK_PLATFORM)
- set(onnxruntime_DELAYLOAD_FLAGS "${onnxruntime_DELAYLOAD_FLAGS} /DELAYLOAD:DirectML.dll /DELAYLOAD:d3d12.dll /DELAYLOAD:dxgi.dll /DELAYLOAD:api-ms-win-core-com-l1-1-0.dll /DELAYLOAD:shlwapi.dll /DELAYLOAD:oleaut32.dll /DELAYLOAD:ext-ms-win-dxcore-l1-*.dll /ignore:4199")
+ set(onnxruntime_DELAYLOAD_FLAGS "${onnxruntime_DELAYLOAD_FLAGS} /DELAYLOAD:DirectML.dll /DELAYLOAD:d3d12.dll /DELAYLOAD:dxgi.dll /DELAYLOAD:dxcore.dll /DELAYLOAD:api-ms-win-core-com-l1-1-0.dll /DELAYLOAD:shlwapi.dll /DELAYLOAD:oleaut32.dll /DELAYLOAD:ext-ms-win-dxcore-l1-*.dll /ignore:4199")
endif()
target_compile_definitions(onnxruntime_providers_dml
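
The change above only adds dxcore.dll to the existing delay-load list. For readers unfamiliar with the mechanism: delay-loaded DLLs are resolved on first call rather than at process start, which requires linking delayimp.lib plus one /DELAYLOAD flag per DLL. A minimal sketch with an illustrative target name:

if(MSVC)
  target_link_libraries(my_provider PRIVATE delayimp.lib)
  # /ignore:4199 silences the 'no imports found' linker warning when a delay-loaded DLL is unused.
  target_link_options(my_provider PRIVATE "/DELAYLOAD:dxcore.dll" "/ignore:4199")
endif()
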
diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake
index 91ac66a40721d..01c4f8b2c8719 100644
--- a/cmake/onnxruntime_providers_migraphx.cmake
+++ b/cmake/onnxruntime_providers_migraphx.cmake
@@ -49,7 +49,7 @@
target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare)
set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections")
- target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs)
+ target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp)
include(CheckLibraryExists)
check_library_exists(migraphx::c "migraphx_program_run_async" "/opt/rocm/migraphx/lib" HAS_STREAM_SYNC)
diff --git a/cmake/onnxruntime_providers_nnapi.cmake b/cmake/onnxruntime_providers_nnapi.cmake
index 5ac25a3b76efb..b718a976eb26f 100644
--- a/cmake/onnxruntime_providers_nnapi.cmake
+++ b/cmake/onnxruntime_providers_nnapi.cmake
@@ -49,12 +49,10 @@
endif()
# These are shared utils,
- # TODO, move this to a separated lib when used by EPs other than NNAPI and CoreML
+ # TODO, move this to a separate lib when used by EPs other than NNAPI and CoreML
list(APPEND onnxruntime_provider_nnapi_cc_src_patterns
"${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
- "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.h"
- "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.cc"
)
file(GLOB onnxruntime_providers_nnapi_cc_srcs CONFIGURE_DEPENDS ${onnxruntime_provider_nnapi_cc_src_patterns})
@@ -81,4 +79,4 @@
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
- endif()
\ No newline at end of file
+ endif()
diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index e26f0bfc0b751..5876b2b5c448b 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -16,23 +16,19 @@
endif()
# Header paths
- find_package(InferenceEngine REQUIRED)
- find_package(ngraph REQUIRED)
-
- if (OPENVINO_2022_1 OR OPENVINO_2022_2)
find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
- list (OV_20_LIBS openvino::frontend::onnx openvino::runtime)
+ if(OpenVINO_VERSION VERSION_LESS 2023.0)
+ message(FATAL_ERROR "OpenVINO 2023.0 and newer are supported. Please use the latest OpenVINO release.")
endif()
if (WIN32)
unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
endif()
+ list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}))
add_definitions(-DIO_BUFFER_ENABLED=1)
- list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS} ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES})
- else()
- list(APPEND OPENVINO_LIB_LIST ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES})
+ list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
endif()
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs})
@@ -75,7 +71,14 @@
message(FATAL_ERROR "onnxruntime_providers_openvino unknown platform, need to specify shared library exports for it")
endif()
- install(TARGETS onnxruntime_providers_openvino
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
\ No newline at end of file
+ if (CMAKE_OPENVINO_LIBRARY_INSTALL_DIR)
+ install(TARGETS onnxruntime_providers_openvino
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_OPENVINO_LIBRARY_INSTALL_DIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ else()
+ install(TARGETS onnxruntime_providers_openvino
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endif()
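
The duplicated install() call above could also be written with a single destination variable; a sketch under the same assumption that CMAKE_OPENVINO_LIBRARY_INSTALL_DIR is an optional cache override (the helper variable name is illustrative):

if(CMAKE_OPENVINO_LIBRARY_INSTALL_DIR)
  set(_openvino_lib_dest ${CMAKE_OPENVINO_LIBRARY_INSTALL_DIR})
else()
  set(_openvino_lib_dest ${CMAKE_INSTALL_LIBDIR})
endif()
install(TARGETS onnxruntime_providers_openvino
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        LIBRARY DESTINATION ${_openvino_lib_dest}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
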
diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake
index a93a06e960c81..b68d84c23bb32 100644
--- a/cmake/onnxruntime_providers_qnn.cmake
+++ b/cmake/onnxruntime_providers_qnn.cmake
@@ -4,12 +4,10 @@
add_compile_definitions(USE_QNN=1)
# These are shared utils,
- # TODO, move this to a separated lib when used by EPs other than QNN, NNAPI and CoreML
- file(GLOB_RECURSE onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS
+ # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML
+ file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
- "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.h"
- "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.cc"
)
file(GLOB_RECURSE
@@ -42,4 +40,4 @@
# ignore the warning unknown-pragmas on "pragma region"
if(NOT MSVC)
target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas")
- endif()
\ No newline at end of file
+ endif()
diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake
index 686a993de3a4a..e56de0c7124dc 100644
--- a/cmake/onnxruntime_providers_tensorrt.cmake
+++ b/cmake/onnxruntime_providers_tensorrt.cmake
@@ -8,7 +8,7 @@
set(BUILD_LIBRARY_ONLY 1)
add_definitions("-DONNX_ML=1")
add_definitions("-DONNX_NAMESPACE=onnx")
- set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+ set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME})
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(PROTOBUF_LIBRARY ${PROTOBUF_LIB})
@@ -34,31 +34,97 @@
MESSAGE(STATUS "[Note] There is an issue when running \"Debug build\" TRT EP with \"Release build\" TRT built-in parser on Windows. This build will use tensorrt oss parser instead.")
endif()
+ find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+ HINTS ${TENSORRT_ROOT}
+ PATH_SUFFIXES include)
+
+
+ file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT)
+ string(REGEX MATCH "define NV_TENSORRT_MAJOR * +([0-9]+)" NV_TENSORRT_MAJOR "${NVINFER_VER_CONTENT}")
+ string(REGEX REPLACE "define NV_TENSORRT_MAJOR * +([0-9]+)" "\\1" NV_TENSORRT_MAJOR "${NV_TENSORRT_MAJOR}")
+ string(REGEX MATCH "define NV_TENSORRT_MINOR * +([0-9]+)" NV_TENSORRT_MINOR "${NVINFER_VER_CONTENT}")
+ string(REGEX REPLACE "define NV_TENSORRT_MINOR * +([0-9]+)" "\\1" NV_TENSORRT_MINOR "${NV_TENSORRT_MINOR}")
+ string(REGEX MATCH "define NV_TENSORRT_PATCH * +([0-9]+)" NV_TENSORRT_PATCH "${NVINFER_VER_CONTENT}")
+ string(REGEX REPLACE "define NV_TENSORRT_PATCH * +([0-9]+)" "\\1" NV_TENSORRT_PATCH "${NV_TENSORRT_PATCH}")
+ math(EXPR NV_TENSORRT_MAJOR_INT "${NV_TENSORRT_MAJOR}")
+ math(EXPR NV_TENSORRT_MINOR_INT "${NV_TENSORRT_MINOR}")
+ math(EXPR NV_TENSORRT_PATCH_INT "${NV_TENSORRT_PATCH}")
+
+ if (NV_TENSORRT_MAJOR)
+ MESSAGE(STATUS "NV_TENSORRT_MAJOR is ${NV_TENSORRT_MAJOR}")
+ else()
+ MESSAGE(STATUS "Can't find NV_TENSORRT_MAJOR macro")
+ endif()
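
Because TensorRT ships no CMake package, the version has to be scraped out of NvInferVersion.h as done above. The same pattern in its simplest form uses file(READ) plus a single regex capture; the macro and variable names match the hunk, but this condensed regex is illustrative rather than the project's exact one.

file(READ "${TENSORRT_INCLUDE_DIR}/NvInferVersion.h" _nvinfer_ver_content)
string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" _unused "${_nvinfer_ver_content}")
set(NV_TENSORRT_MAJOR "${CMAKE_MATCH_1}")
message(STATUS "Parsed NV_TENSORRT_MAJOR=${NV_TENSORRT_MAJOR}")
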
+
+ # Check TRT version >= 10.0.1.6
+ if ((NV_TENSORRT_MAJOR_INT GREATER 10) OR
+ (NV_TENSORRT_MAJOR_INT EQUAL 10 AND NV_TENSORRT_MINOR_INT GREATER 0) OR
+ (NV_TENSORRT_MAJOR_INT EQUAL 10 AND NV_TENSORRT_PATCH_INT GREATER 0))
+ set(TRT_GREATER_OR_EQUAL_TRT_10_GA ON)
+ endif()
+
+ # TensorRT 10 GA onwards, the TensorRT libraries will have major version appended to the end on Windows,
+ # for example, nvinfer_10.dll, nvinfer_plugin_10.dll, nvonnxparser_10.dll ...
+ if (WIN32 AND TRT_GREATER_OR_EQUAL_TRT_10_GA)
+ set(NVINFER_LIB "nvinfer_${NV_TENSORRT_MAJOR}")
+ set(NVINFER_PLUGIN_LIB "nvinfer_plugin_${NV_TENSORRT_MAJOR}")
+ set(PARSER_LIB "nvonnxparser_${NV_TENSORRT_MAJOR}")
+ endif()
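
To make the naming rule concrete (values are illustrative): with TensorRT 10.0.1 the patch-version clause above sets TRT_GREATER_OR_EQUAL_TRT_10_GA, so on Windows the later find_library calls look for the suffixed names, while Linux and TensorRT 8.x keep the unsuffixed defaults set just below.

#   NVINFER_LIB        -> nvinfer_10        (Windows, TRT >= 10 GA)   otherwise nvinfer
#   NVINFER_PLUGIN_LIB -> nvinfer_plugin_10 (Windows, TRT >= 10 GA)   otherwise nvinfer_plugin
#   PARSER_LIB         -> nvonnxparser_10   (Windows, TRT >= 10 GA)   otherwise nvonnxparser
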
+
+ if (NOT NVINFER_LIB)
+ set(NVINFER_LIB "nvinfer")
+ endif()
+
+ if (NOT NVINFER_PLUGIN_LIB)
+ set(NVINFER_PLUGIN_LIB "nvinfer_plugin")
+ endif()
+
+ if (NOT PARSER_LIB)
+ set(PARSER_LIB "nvonnxparser")
+ endif()
+
+ MESSAGE(STATUS "Looking for ${NVINFER_LIB} and ${NVINFER_PLUGIN_LIB}")
+
+ find_library(TENSORRT_LIBRARY_INFER ${NVINFER_LIB}
+ HINTS ${TENSORRT_ROOT}
+ PATH_SUFFIXES lib lib64 lib/x64)
+
+ if (NOT TENSORRT_LIBRARY_INFER)
+ MESSAGE(STATUS "Can't find ${NVINFER_LIB}")
+ endif()
+
+ find_library(TENSORRT_LIBRARY_INFER_PLUGIN ${NVINFER_PLUGIN_LIB}
+ HINTS ${TENSORRT_ROOT}
+ PATH_SUFFIXES lib lib64 lib/x64)
+
+ if (NOT TENSORRT_LIBRARY_INFER_PLUGIN)
+ MESSAGE(STATUS "Can't find ${NVINFER_PLUGIN_LIB}")
+ endif()
+
if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
- # Add TensorRT library
- find_path(TENSORRT_INCLUDE_DIR NvInfer.h
- HINTS ${TENSORRT_ROOT}
- PATH_SUFFIXES include)
- MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
- find_library(TENSORRT_LIBRARY_INFER nvinfer
- HINTS ${TENSORRT_ROOT}
- PATH_SUFFIXES lib lib64 lib/x64)
- find_library(TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin
- HINTS ${TENSORRT_ROOT}
- PATH_SUFFIXES lib lib64 lib/x64)
- find_library(TENSORRT_LIBRARY_NVONNXPARSER nvonnxparser
+ MESSAGE(STATUS "Looking for ${PARSER_LIB}")
+
+ find_library(TENSORRT_LIBRARY_NVONNXPARSER ${PARSER_LIB}
HINTS ${TENSORRT_ROOT}
PATH_SUFFIXES lib lib64 lib/x64)
+
+ if (NOT TENSORRT_LIBRARY_NVONNXPARSER)
+ MESSAGE(STATUS "Can't find ${PARSER_LIB}")
+ endif()
+
set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN} ${TENSORRT_LIBRARY_NVONNXPARSER})
MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
else()
+ if (TRT_GREATER_OR_EQUAL_TRT_10_GA)
+ set(ONNX_USE_LITE_PROTO ON)
+ endif()
FetchContent_Declare(
onnx_tensorrt
URL ${DEP_URL_onnx_tensorrt}
URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt}
)
if (NOT CUDA_INCLUDE_DIR)
- set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build
+ set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) # onnx-tensorrt repo needs this variable to build
endif()
# The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses
# unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose.
@@ -73,17 +139,22 @@
unset(PROTOBUF_LIBRARY)
unset(OLD_CMAKE_CXX_FLAGS)
unset(OLD_CMAKE_CUDA_FLAGS)
- set_target_properties(nvonnxparser PROPERTIES LINK_FLAGS "/ignore:4199")
+ set_target_properties(${PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199")
target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100)
- target_compile_options(nvonnxparser PRIVATE /FIio.h /wd4100)
+ target_compile_options(${PARSER_LIB} PRIVATE /FIio.h /wd4100)
endif()
+ # Static libraries are just nvonnxparser_static on all platforms
set(onnxparser_link_libs nvonnxparser_static)
+ set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN})
+ MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
# ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static.
# nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt
# See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121
+ # However, starting from TRT 10 GA, nvonnxparser_static doesn't link against tensorrt libraries.
+ # Therefore, the above code finds ${TENSORRT_LIBRARY_INFER} and ${TENSORRT_LIBRARY_INFER_PLUGIN}.
set(trt_link_libs cudnn cublas ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY})
file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS
@@ -102,11 +173,12 @@
onnxruntime_add_include_to_target(onnxruntime_providers_tensorrt onnxruntime_common onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface)
add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
- target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS})
+ target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
else()
- target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS})
+ target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
endif()
- target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+ target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS}
+ PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
if(onnxruntime_CUDNN_HOME)
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include)
endif()
@@ -134,7 +206,7 @@
elseif(UNIX)
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/tensorrt/version_script.lds -Xlinker --gc-sections")
- target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp stdc++fs)
+ target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp)
elseif(WIN32)
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/tensorrt/symbols.def")
else()
diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake
index 0951c2d02664d..3e848e1fd44a0 100644
--- a/cmake/onnxruntime_providers_vitisai.cmake
+++ b/cmake/onnxruntime_providers_vitisai.cmake
@@ -14,14 +14,19 @@
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs})
- onnxruntime_add_static_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs})
- onnxruntime_add_include_to_target(onnxruntime_providers_vitisai onnxruntime_common onnxruntime_framework onnx onnx_proto)
- target_link_libraries(onnxruntime_providers_vitisai PRIVATE onnx protobuf::libprotobuf nlohmann_json::nlohmann_json)
- if(NOT MSVC)
- target_compile_options(onnxruntime_providers_vitisai PUBLIC $<$:-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0>)
- endif(NOT MSVC)
+ onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs})
+ onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} nlohmann_json::nlohmann_json safeint_interface flatbuffers::flatbuffers)
+ target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED})
+ if(MSVC)
+ onnxruntime_add_include_to_target(onnxruntime_providers_vitisai dbghelp)
+ set_property(TARGET onnxruntime_providers_vitisai APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/vitisai/symbols.def")
+ else(MSVC)
+ set_property(TARGET onnxruntime_providers_vitisai APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/vitisai/version_script.lds -Xlinker --gc-sections")
+ endif(MSVC)
target_include_directories(onnxruntime_providers_vitisai PRIVATE "${ONNXRUNTIME_ROOT}/core/providers/vitisai/include" ${XRT_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/VitisAI)
if(MSVC)
@@ -30,17 +35,18 @@
target_compile_options(onnxruntime_providers_vitisai PRIVATE "/wd4251")
# for unused formal parameter
target_compile_options(onnxruntime_providers_vitisai PRIVATE "/wd4100")
+ # for type name first seen using 'class' now seen using 'struct'
+ target_compile_options(onnxruntime_providers_vitisai PRIVATE "/wd4099")
else(MSVC)
+ target_compile_options(onnxruntime_providers_vitisai PUBLIC $<$:-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0>)
target_compile_options(onnxruntime_providers_vitisai PRIVATE -Wno-unused-parameter)
endif(MSVC)
set_target_properties(onnxruntime_providers_vitisai PROPERTIES FOLDER "ONNXRuntime")
set_target_properties(onnxruntime_providers_vitisai PROPERTIES LINKER_LANGUAGE CXX)
- if (NOT onnxruntime_BUILD_SHARED_LIB)
- install(TARGETS onnxruntime_providers_vitisai
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
- FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
- endif()
+ install(TARGETS onnxruntime_providers_vitisai
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/cmake/onnxruntime_providers_xnnpack.cmake b/cmake/onnxruntime_providers_xnnpack.cmake
index 9c00703ca0846..796536ac9d12b 100644
--- a/cmake/onnxruntime_providers_xnnpack.cmake
+++ b/cmake/onnxruntime_providers_xnnpack.cmake
@@ -7,9 +7,6 @@
"${ONNXRUNTIME_INCLUDE_DIR}/core/providers/xnnpack/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/xnnpack/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/xnnpack/*.cc"
- # utils for handling QDQ models
- "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.h"
- "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.cc"
)
source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_xnnpack_cc_srcs})
@@ -19,6 +16,12 @@
flatbuffers::flatbuffers Boost::mp11 safeint_interface
)
+ # TODO fix stringop-overflow warnings
+ # Add compile option to suppress stringop-overflow error in Flatbuffers.
+ if (HAS_STRINGOP_OVERFLOW)
+ target_compile_options(onnxruntime_providers_xnnpack PRIVATE -Wno-error=stringop-overflow)
+ endif()
+
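
HAS_STRINGOP_OVERFLOW is assumed to be defined elsewhere in the build as a compiler-capability probe; one plausible way such a guard could be produced is shown below, though the project may define it differently.

include(CheckCXXCompilerFlag)
# Hypothetical probe: true for compilers (e.g. GCC) that understand -Wstringop-overflow.
check_cxx_compiler_flag("-Wno-error=stringop-overflow" HAS_STRINGOP_OVERFLOW)
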
add_dependencies(onnxruntime_providers_xnnpack onnx ${onnxruntime_EXTERNAL_DEPENDENCIES})
set_target_properties(onnxruntime_providers_xnnpack PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 61922961588b2..b3669931d33dd 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -170,7 +170,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_session
${onnxruntime_libs}
${PROVIDERS_TVM}
- ${PROVIDERS_VITISAI}
${PROVIDERS_NNAPI}
${PROVIDERS_XNNPACK}
${PROVIDERS_COREML}
@@ -283,10 +282,7 @@ if (WIN32)
get_filename_component(CUDNN_DLL_NAME ${CUDNN_DLL_PATH} NAME_WE)
string(REPLACE "cudnn64_" "" CUDNN_VERSION "${CUDNN_DLL_NAME}")
if(NOT onnxruntime_CUDA_VERSION)
- message("Reading json file ${onnxruntime_CUDA_HOME}/version.json")
- set(CUDA_SDK_JSON_FILE_PATH "${onnxruntime_CUDA_HOME}/version.json")
- file(READ ${CUDA_SDK_JSON_FILE_PATH} CUDA_SDK_JSON_CONTENT)
- string(JSON onnxruntime_CUDA_VERSION GET ${CUDA_SDK_JSON_CONTENT} "cuda" "version")
+ set(onnxruntime_CUDA_VERSION ${CUDAToolkit_VERSION})
message("onnxruntime_CUDA_VERSION=${onnxruntime_CUDA_VERSION}")
endif()
file(APPEND "${VERSION_INFO_FILE}"
@@ -354,9 +350,6 @@ if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_optim_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/optim/*.py"
)
- file(GLOB onnxruntime_python_torchdynamo_srcs CONFIGURE_DEPENDS
- "${ORTTRAINING_SOURCE_DIR}/python/training/torchdynamo/*.py"
- )
file(GLOB onnxruntime_python_ortmodule_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/*.py"
)
@@ -387,6 +380,9 @@ if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_ortmodule_graph_optimizers_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/graph_optimizers/*"
)
+ file(GLOB onnxruntime_python_ortmodule_pipe_srcs CONFIGURE_DEPENDS
+ "${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/experimental/pipe/*"
+ )
file(GLOB onnxruntime_python_ort_triton_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ort_triton/*.py"
)
@@ -477,6 +473,9 @@ file(GLOB onnxruntime_python_transformers_models_llama_src CONFIGURE_DEPENDS
file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py"
)
+file(GLOB onnxruntime_python_transformers_models_phi2_src CONFIGURE_DEPENDS
+ "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/phi2/*.py"
+)
file(GLOB onnxruntime_python_transformers_models_stable_diffusion_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/stable_diffusion/*.py"
)
@@ -547,6 +546,7 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/gpt2
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/llama
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/longformer
+ COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/phi2
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/stable_diffusion
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/t5
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/whisper
@@ -650,6 +650,9 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_longformer_src}
$/onnxruntime/transformers/models/longformer/
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${onnxruntime_python_transformers_models_phi2_src}
+ $/onnxruntime/transformers/models/phi2/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_stable_diffusion_src}
$/onnxruntime/transformers/models/stable_diffusion/
@@ -690,7 +693,7 @@ if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
endif()
if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
- AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS"
+ AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS"
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android"
AND NOT onnxruntime_USE_ROCM
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
@@ -746,7 +749,6 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/experimental
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/experimental/gradient_graph
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/optim
- COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/torchdynamo
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/experimental
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/experimental/json_config
@@ -757,6 +759,7 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/graph_optimizers
+ COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/experimental/pipe
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ort_triton
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ort_triton/kernel
COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/utils
@@ -777,9 +780,6 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_optim_srcs}
$/onnxruntime/training/optim/
- COMMAND ${CMAKE_COMMAND} -E copy
- ${onnxruntime_python_torchdynamo_srcs}
- $/onnxruntime/training/torchdynamo/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_srcs}
$/onnxruntime/training/ortmodule/
@@ -810,6 +810,9 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_graph_optimizers_srcs}
$/onnxruntime/training/ortmodule/graph_optimizers/
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${onnxruntime_python_ortmodule_pipe_srcs}
+ $/onnxruntime/training/ortmodule/experimental/pipe/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ort_triton_srcs}
$/onnxruntime/training/ort_triton/
@@ -859,6 +862,16 @@ if (onnxruntime_USE_DNNL)
)
endif()
+if (onnxruntime_USE_VITISAI)
+ add_custom_command(
+ TARGET onnxruntime_pybind11_state POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${DNNL_DLL_PATH} $
+ $
+ $/onnxruntime/capi/
+ )
+endif()
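
For context, a post-build copy of a provider library into the Python capi directory normally relies on $<TARGET_FILE:...> and $<TARGET_FILE_DIR:...> generator expressions, along the lines of this sketch (target names are illustrative):

add_custom_command(
  TARGET onnxruntime_pybind11_state POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E copy
          $<TARGET_FILE:onnxruntime_providers_shared>
          $<TARGET_FILE_DIR:onnxruntime_pybind11_state>/onnxruntime/capi/
)
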
+
if (onnxruntime_USE_TENSORRT)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
@@ -995,6 +1008,23 @@ if (onnxruntime_USE_COREML)
)
endif()
+if (onnxruntime_USE_QNN)
+ add_custom_command(
+ TARGET onnxruntime_pybind11_state POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${QNN_LIB_FILES}
+ $/onnxruntime/capi/
+ )
+ if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
+ add_custom_command(
+ TARGET onnxruntime_pybind11_state POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf"
+ $/onnxruntime/
+ )
+ endif()
+endif()
+
endif()
if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
include(onnxruntime_language_interop_ops.cmake)
diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
index f70961a66329a..bd472f08f68ba 100644
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@@ -20,10 +20,6 @@ set(contrib_ops_excluded_files
"bert/fastertransformer_decoder_attention/*"
"bert/multihead_attention.cc"
"bert/multihead_attention.h"
- "bert/fast_gelu_impl.cu"
- "bert/fast_gelu_impl.h"
- "bert/fast_gelu.cc"
- "bert/fast_gelu.h"
"bert/relative_attn_bias.cc"
"bert/relative_attn_bias.h"
"bert/relative_attn_bias_impl.cu"
@@ -44,28 +40,24 @@ set(contrib_ops_excluded_files
"bert/packed_multihead_attention.cc"
"bert/packed_multihead_attention_impl.h"
"bert/packed_multihead_attention_impl.cu"
- "diffusion/group_norm.cc"
"diffusion/group_norm_impl.cu"
- "diffusion/group_norm_impl.h"
"diffusion/nhwc_conv.cc"
"math/gemm_float8.cc"
"math/gemm_float8.cu"
"math/gemm_float8.h"
"moe/*"
+ "sparse/*"
"quantization/attention_quantization.cc"
"quantization/attention_quantization.h"
"quantization/attention_quantization_impl.cu"
"quantization/attention_quantization_impl.cuh"
- "quantization/dequantize_blockwise.cuh"
- "quantization/dequantize_blockwise.cu"
"quantization/dequantize_blockwise_bnb4.cuh"
"quantization/dequantize_blockwise_bnb4.cu"
"quantization/matmul_bnb4.cc"
"quantization/matmul_bnb4.cuh"
"quantization/matmul_bnb4.cu"
- "quantization/matmul_nbits.cc"
- "quantization/matmul_nbits.cuh"
- "quantization/matmul_nbits.cu"
+ "quantization/moe_quantization.h"
+ "quantization/moe_quantization.cc"
"quantization/quantize_dequantize_linear.cc"
"quantization/qordered_ops/qordered_attention_impl.cu"
"quantization/qordered_ops/qordered_attention_impl.h"
@@ -100,26 +92,18 @@ set(contrib_ops_excluded_files
"bert/group_query_attention.cc"
"bert/group_query_attention_impl.h"
"bert/group_query_attention_impl.cu"
+ "collective/distributed_*"
+ "collective/shard*"
)
-if (NOT onnxruntime_ENABLE_ATEN)
- list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc")
-endif()
if (NOT onnxruntime_USE_NCCL)
# Those are string patterns to exclude. Do NOT use stars such as
# collective/*.cc or *.h.
list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc")
- list(APPEND contrib_ops_excluded_files "collective/sharded_moe.h")
- list(APPEND contrib_ops_excluded_files "collective/sharded_moe.cc")
- list(APPEND contrib_ops_excluded_files "collective/sharding.cc")
- list(APPEND contrib_ops_excluded_files "collective/sharding_spec.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_reduce.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_unsqueeze.cc")
- list(APPEND contrib_ops_excluded_files "collective/distributed_squeeze.cc")
+endif()
+
+if (NOT onnxruntime_ENABLE_ATEN)
+ list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc")
endif()
set(provider_excluded_files
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 7c8c70f913dca..6fb402be42165 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1,6 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
-if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+if (IOS)
find_package(XCTest REQUIRED)
endif()
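
The repeated switch from the string comparison to `if (IOS)` in this file relies on CMake defining the IOS variable whenever CMAKE_SYSTEM_NAME is iOS (available since CMake 3.14), so the two spellings are interchangeable:

if(CMAKE_SYSTEM_NAME STREQUAL "iOS")   # long form
  message(STATUS "building for iOS")
endif()
if(IOS)                                # equivalent shorthand set by CMake itself
  message(STATUS "building for iOS")
endif()
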
@@ -18,7 +18,7 @@ function(AddTest)
cmake_parse_arguments(_UT "DYN" "TARGET" "LIBS;SOURCES;DEPENDS;TEST_ARGS" ${ARGN})
list(REMOVE_DUPLICATES _UT_SOURCES)
- if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+ if (IOS)
onnxruntime_add_executable(${_UT_TARGET} ${TEST_SRC_DIR}/xctest/orttestmain.m)
else()
onnxruntime_add_executable(${_UT_TARGET} ${_UT_SOURCES})
@@ -67,7 +67,7 @@ function(AddTest)
if(onnxruntime_USE_CUDA)
#XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs,
# otherwise it will impact when CUDA DLLs can be unloaded.
- target_link_libraries(${_UT_TARGET} PRIVATE cudart)
+ target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart)
endif()
target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES})
endif()
@@ -111,7 +111,9 @@ function(AddTest)
target_compile_options(${_UT_TARGET} PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options -Wno-error=sign-compare>"
"$<$>:-Wno-error=sign-compare>")
- target_compile_options(${_UT_TARGET} PRIVATE "-Wno-error=uninitialized")
+ if (${HAS_NOERROR})
+ target_compile_options(${_UT_TARGET} PRIVATE "$<$:-Wno-error=uninitialized>")
+ endif()
endif()
set(TEST_ARGS ${_UT_TEST_ARGS})
@@ -127,7 +129,7 @@ function(AddTest)
endif()
endif(onnxruntime_GENERATE_TEST_REPORTS)
- if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+ if (IOS)
# target_sources(${_UT_TARGET} PRIVATE ${TEST_SRC_DIR}/xctest/orttestmain.m)
set_target_properties(${_UT_TARGET} PROPERTIES FOLDER "ONNXRuntimeTest"
MACOSX_BUNDLE_BUNDLE_NAME ${_UT_TARGET}
@@ -248,11 +250,16 @@ file(GLOB onnxruntime_test_common_src CONFIGURE_DEPENDS
"${TEST_SRC_DIR}/common/logging/*.h"
)
-file(GLOB onnxruntime_test_quantiztion_src CONFIGURE_DEPENDS
+file(GLOB onnxruntime_test_quantization_src CONFIGURE_DEPENDS
"${TEST_SRC_DIR}/quantization/*.cc"
"${TEST_SRC_DIR}/quantization/*.h"
)
+file(GLOB onnxruntime_test_flatbuffers_src CONFIGURE_DEPENDS
+ "${TEST_SRC_DIR}/flatbuffers/*.cc"
+ "${TEST_SRC_DIR}/flatbuffers/*.h"
+)
+
if(NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD)
file(GLOB onnxruntime_test_ir_src CONFIGURE_DEPENDS
@@ -565,11 +572,7 @@ if(onnxruntime_USE_ROCM)
endif()
if(onnxruntime_USE_COREML)
- if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS")
- list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml onnxruntime_coreml_proto)
- else()
- list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml)
- endif()
+ list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto)
endif()
if(onnxruntime_USE_ACL)
@@ -591,7 +594,6 @@ set(ONNXRUNTIME_TEST_LIBS
# CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime
${PROVIDERS_NNAPI}
${PROVIDERS_JS}
- ${PROVIDERS_VITISAI}
${PROVIDERS_QNN}
${PROVIDERS_SNPE}
${PROVIDERS_RKNPU}
@@ -675,15 +677,9 @@ endif()
if(onnxruntime_USE_COREML)
list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*)
- if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS")
- list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml onnxruntime_coreml_proto)
- list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml onnxruntime_coreml_proto)
- list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml onnxruntime_coreml_proto)
- else()
- list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml)
- list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml)
- list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml)
- endif()
+ list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml coreml_proto)
+ list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto)
+ list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml coreml_proto)
endif()
if(onnxruntime_USE_XNNPACK)
@@ -743,37 +739,41 @@ target_include_directories(onnxruntime_test_utils PUBLIC "${TEST_SRC_DIR}/util/i
set_target_properties(onnxruntime_test_utils PROPERTIES FOLDER "ONNXRuntimeTest")
source_group(TREE ${TEST_SRC_DIR} FILES ${onnxruntime_test_utils_src})
-set(onnx_test_runner_src_dir ${TEST_SRC_DIR}/onnx)
-file(GLOB onnx_test_runner_common_srcs CONFIGURE_DEPENDS
- ${onnx_test_runner_src_dir}/*.h
- ${onnx_test_runner_src_dir}/*.cc)
+if(NOT IOS)
+ set(onnx_test_runner_src_dir ${TEST_SRC_DIR}/onnx)
+ file(GLOB onnx_test_runner_common_srcs CONFIGURE_DEPENDS
+ ${onnx_test_runner_src_dir}/*.h
+ ${onnx_test_runner_src_dir}/*.cc)
-list(REMOVE_ITEM onnx_test_runner_common_srcs ${onnx_test_runner_src_dir}/main.cc)
+ list(REMOVE_ITEM onnx_test_runner_common_srcs ${onnx_test_runner_src_dir}/main.cc)
-onnxruntime_add_static_library(onnx_test_runner_common ${onnx_test_runner_common_srcs})
-if(MSVC)
- target_compile_options(onnx_test_runner_common PRIVATE "$<$:SHELL:--compiler-options /utf-8>"
- "$<$>:/utf-8>")
-else()
- target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11)
- target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
- onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp)
-endif()
-if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
- #TODO: fix the warnings, they are dangerous
- target_compile_options(onnx_test_runner_common PRIVATE "/wd4244")
-endif()
-onnxruntime_add_include_to_target(onnx_test_runner_common onnxruntime_common onnxruntime_framework
- onnxruntime_test_utils onnx onnx_proto re2::re2 flatbuffers::flatbuffers Boost::mp11 safeint_interface)
+ onnxruntime_add_static_library(onnx_test_runner_common ${onnx_test_runner_common_srcs})
+ if(MSVC)
+ target_compile_options(onnx_test_runner_common PRIVATE "$<$:SHELL:--compiler-options /utf-8>"
+ "$<$>:/utf-8>")
+ else()
+ target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11)
+ target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
+ onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp)
+ endif()
+ if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+ #TODO: fix the warnings, they are dangerous
+ target_compile_options(onnx_test_runner_common PRIVATE "/wd4244")
+ endif()
+ onnxruntime_add_include_to_target(onnx_test_runner_common onnxruntime_common onnxruntime_framework
+ onnxruntime_test_utils onnx onnx_proto re2::re2 flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-add_dependencies(onnx_test_runner_common onnx_test_data_proto ${onnxruntime_EXTERNAL_DEPENDENCIES})
-target_include_directories(onnx_test_runner_common PRIVATE ${eigen_INCLUDE_DIRS}
- ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
+ add_dependencies(onnx_test_runner_common onnx_test_data_proto ${onnxruntime_EXTERNAL_DEPENDENCIES})
+ target_include_directories(onnx_test_runner_common PRIVATE ${eigen_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
-set_target_properties(onnx_test_runner_common PROPERTIES FOLDER "ONNXRuntimeTest")
+ set_target_properties(onnx_test_runner_common PROPERTIES FOLDER "ONNXRuntimeTest")
+ set(onnx_test_runner_common_lib onnx_test_runner_common)
+endif()
set(all_tests ${onnxruntime_test_common_src} ${onnxruntime_test_ir_src} ${onnxruntime_test_optimizer_src}
- ${onnxruntime_test_framework_src} ${onnxruntime_test_providers_src} ${onnxruntime_test_quantiztion_src})
+ ${onnxruntime_test_framework_src} ${onnxruntime_test_providers_src} ${onnxruntime_test_quantization_src}
+ ${onnxruntime_test_flatbuffers_src})
if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
file(GLOB onnxruntime_test_providers_cuda_ut_src CONFIGURE_DEPENDS
@@ -783,7 +783,15 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $)
config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
+ target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
+ if (MSVC)
+ # Cutlass code has an issue with the following:
+ # warning C4100: 'magic': unreferenced formal parameter
+ target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--compiler-options /wd4100>"
+ "$<$>:/wd4100>")
+ endif()
+
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut)
endif()
@@ -824,6 +832,17 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
"${TEST_SRC_DIR}/providers/memcpy_test.cc"
)
endif()
+ list(REMOVE_ITEM all_tests "${TEST_SRC_DIR}/providers/cpu/reduction/reduction_ops_test.cc"
+ "${TEST_SRC_DIR}/providers/cpu/tensor/grid_sample_test.cc")
+endif()
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten" OR IOS)
+  # These model tests are not run in our web or iOS CI build pipelines, and some of the test code uses C++17
+ # filesystem functions that are not available in the iOS version we target.
+ message("Disable model tests in onnxruntime_test_all")
+ list(REMOVE_ITEM all_tests
+ "${TEST_SRC_DIR}/providers/cpu/model_tests.cc"
+ )
endif()
set(test_all_args)
@@ -843,7 +862,7 @@ AddTest(
TARGET onnxruntime_test_all
SOURCES ${all_tests} ${onnxruntime_unittest_main_src}
LIBS
- onnx_test_runner_common ${onnxruntime_test_providers_libs} ${onnxruntime_test_common_libs}
+ ${onnx_test_runner_common_lib} ${onnxruntime_test_providers_libs} ${onnxruntime_test_common_libs}
onnx_test_data_proto
DEPENDS ${all_dependencies}
TEST_ARGS ${test_all_args}
@@ -857,6 +876,11 @@ if (MSVC)
"$<$>:/wd26451>")
target_compile_options(onnxruntime_test_all PRIVATE "$<$:SHELL:--compiler-options /wd4244>"
"$<$>:/wd4244>")
+
+ # Avoid this compile error in graph_transform_test.cc:
+ # fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj
+ set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc"
+ APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
else()
target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
endif()
@@ -881,7 +905,7 @@ endif()
# the default logger tests conflict with the need to have an overall default logger
# so skip in this type of
target_compile_definitions(onnxruntime_test_all PUBLIC -DSKIP_DEFAULT_LOGGER_TESTS)
-if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+if (IOS)
target_compile_definitions(onnxruntime_test_all_xc PUBLIC -DSKIP_DEFAULT_LOGGER_TESTS)
endif()
if(onnxruntime_RUN_MODELTEST_IN_DEBUG_MODE)
@@ -906,12 +930,14 @@ if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
set_target_properties(onnxruntime_test_all PROPERTIES LINK_DEPENDS ${TEST_SRC_DIR}/wasm/onnxruntime_test_all_adapter.js)
- set_target_properties(onnxruntime_test_all PROPERTIES LINK_FLAGS "-s STACK_SIZE=5242880 -s ALLOW_MEMORY_GROWTH=1 -s MAXIMUM_MEMORY=4294967296 --pre-js \"${TEST_SRC_DIR}/wasm/onnxruntime_test_all_adapter.js\" -s \"EXPORTED_RUNTIME_METHODS=['FS']\" --preload-file ${CMAKE_CURRENT_BINARY_DIR}/testdata@/testdata -s EXIT_RUNTIME=1 -s DEMANGLE_SUPPORT=1")
+ set_target_properties(onnxruntime_test_all PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js)
+ set_target_properties(onnxruntime_test_all PROPERTIES LINK_FLAGS "-s STACK_SIZE=5242880 -s INITIAL_MEMORY=536870912 -s ALLOW_MEMORY_GROWTH=1 -s MAXIMUM_MEMORY=4294967296 -s INCOMING_MODULE_JS_API=[preRun,locateFile,arguments,onExit,wasmMemory,buffer,instantiateWasm] --pre-js \"${TEST_SRC_DIR}/wasm/onnxruntime_test_all_adapter.js\" --pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre.js\" -s \"EXPORTED_RUNTIME_METHODS=['FS']\" --preload-file ${CMAKE_CURRENT_BINARY_DIR}/testdata@/testdata -s EXIT_RUNTIME=1")
if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY LINK_FLAGS " -s DEFAULT_PTHREAD_STACK_SIZE=131072 -s PROXY_TO_PTHREAD=1")
endif()
if (onnxruntime_USE_JSEP)
- set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY LINK_FLAGS " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\"")
+ set_target_properties(onnxruntime_test_all PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js)
+ set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY LINK_FLAGS " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\"")
endif()
###
@@ -969,39 +995,17 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()
if (onnxruntime_USE_QNN)
- if (NOT QNN_ARCH_ABI)
- string(TOLOWER ${onnxruntime_target_platform} GEN_PLATFORM)
- if(MSVC)
- message(STATUS "Building MSVC for architecture ${CMAKE_SYSTEM_PROCESSOR} with CMAKE_GENERATOR_PLATFORM as ${GEN_PLATFORM}")
- if (${GEN_PLATFORM} STREQUAL "arm64")
- set(QNN_ARCH_ABI aarch64-windows-msvc)
- else()
- set(QNN_ARCH_ABI x86_64-windows-msvc)
- endif()
- else()
- if (${CMAKE_SYSTEM_NAME} STREQUAL "Android")
- set(QNN_ARCH_ABI aarch64-android-clang6.0)
- elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
- if (${GEN_PLATFORM} STREQUAL "x86_64")
- set(QNN_ARCH_ABI x86_64-linux-clang)
- else()
- set(QNN_ARCH_ABI aarch64-android)
- endif()
- endif()
- endif()
- endif()
-
if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
- file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/*.so" "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/*.dll")
- if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc")
- file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so" "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so")
- list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB})
- endif()
- message(STATUS "QNN lib files: " ${QNN_LIB_FILES})
- add_custom_command(
- TARGET ${test_data_target} POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $
- )
+ add_custom_command(
+ TARGET ${test_data_target} POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $
+ )
+ endif()
+ if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
+ add_custom_command(
+ TARGET ${test_data_target} POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf" $
+ )
endif()
endif()
@@ -1052,45 +1056,42 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
list(APPEND onnx_test_libs onnxruntime_language_interop onnxruntime_pyop)
endif()
-onnxruntime_add_executable(onnx_test_runner ${onnx_test_runner_src_dir}/main.cc)
-if(MSVC)
- target_compile_options(onnx_test_runner PRIVATE "$<$:SHELL:--compiler-options /utf-8>"
- "$<$>:/utf-8>")
-endif()
-if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
- set_target_properties(onnx_test_runner PROPERTIES
- XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO"
- )
-endif()
-if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
- if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
- set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1")
- else()
- set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1")
- endif()
-endif()
+if (NOT IOS)
+ onnxruntime_add_executable(onnx_test_runner ${onnx_test_runner_src_dir}/main.cc)
+ if(MSVC)
+ target_compile_options(onnx_test_runner PRIVATE "$<$:SHELL:--compiler-options /utf-8>"
+ "$<$>:/utf-8>")
+ endif()
+ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+ if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
+ set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1")
+ else()
+ set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1")
+ endif()
+ endif()
-target_link_libraries(onnx_test_runner PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs} nlohmann_json::nlohmann_json)
-target_include_directories(onnx_test_runner PRIVATE ${ONNXRUNTIME_ROOT})
-if (onnxruntime_USE_ROCM)
- target_include_directories(onnx_test_runner PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
-endif()
-if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
- target_link_libraries(onnx_test_runner PRIVATE Python::Python)
-endif()
-set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest")
+ target_link_libraries(onnx_test_runner PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs} nlohmann_json::nlohmann_json)
+ target_include_directories(onnx_test_runner PRIVATE ${ONNXRUNTIME_ROOT})
+ if (onnxruntime_USE_ROCM)
+ target_include_directories(onnx_test_runner PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
+ endif()
+ if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
+ target_link_libraries(onnx_test_runner PRIVATE Python::Python)
+ endif()
+ set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest")
-if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnx_test_runner PRIVATE "/STACK:4000000")
- endif()
-endif()
+ if (onnxruntime_USE_TVM)
+ if (WIN32)
+ target_link_options(onnx_test_runner PRIVATE "/STACK:4000000")
+ endif()
+ endif()
-install(TARGETS onnx_test_runner
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- BUNDLE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(TARGETS onnx_test_runner
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ BUNDLE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
if(onnxruntime_BUILD_BENCHMARKS)
@@ -1171,90 +1172,80 @@ endif()
if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
- #perf test runner
- set(onnxruntime_perf_test_src_dir ${TEST_SRC_DIR}/perftest)
- set(onnxruntime_perf_test_src_patterns
- "${onnxruntime_perf_test_src_dir}/*.cc"
- "${onnxruntime_perf_test_src_dir}/*.h")
+ if(NOT IOS)
+ #perf test runner
+ set(onnxruntime_perf_test_src_dir ${TEST_SRC_DIR}/perftest)
+ set(onnxruntime_perf_test_src_patterns
+ "${onnxruntime_perf_test_src_dir}/*.cc"
+ "${onnxruntime_perf_test_src_dir}/*.h")
- if(WIN32)
- list(APPEND onnxruntime_perf_test_src_patterns
- "${onnxruntime_perf_test_src_dir}/windows/*.cc"
- "${onnxruntime_perf_test_src_dir}/windows/*.h" )
- else ()
- list(APPEND onnxruntime_perf_test_src_patterns
- "${onnxruntime_perf_test_src_dir}/posix/*.cc"
- "${onnxruntime_perf_test_src_dir}/posix/*.h" )
- endif()
+ if(WIN32)
+ list(APPEND onnxruntime_perf_test_src_patterns
+ "${onnxruntime_perf_test_src_dir}/windows/*.cc"
+ "${onnxruntime_perf_test_src_dir}/windows/*.h" )
+ else ()
+ list(APPEND onnxruntime_perf_test_src_patterns
+ "${onnxruntime_perf_test_src_dir}/posix/*.cc"
+ "${onnxruntime_perf_test_src_dir}/posix/*.h" )
+ endif()
- file(GLOB onnxruntime_perf_test_src CONFIGURE_DEPENDS
- ${onnxruntime_perf_test_src_patterns}
- )
- onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc)
- if(MSVC)
- target_compile_options(onnxruntime_perf_test PRIVATE "$<$:SHELL:--compiler-options /utf-8>"
+ file(GLOB onnxruntime_perf_test_src CONFIGURE_DEPENDS
+ ${onnxruntime_perf_test_src_patterns}
+ )
+ onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc)
+ if(MSVC)
+ target_compile_options(onnxruntime_perf_test PRIVATE "$<$:SHELL:--compiler-options /utf-8>"
"$<$>:/utf-8>")
- endif()
- target_include_directories(onnxruntime_perf_test PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
+ endif()
+ target_include_directories(onnxruntime_perf_test PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
${eigen_INCLUDE_DIRS} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
${CMAKE_CURRENT_BINARY_DIR})
- if (onnxruntime_USE_ROCM)
- target_include_directories(onnxruntime_perf_test PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
- endif()
- if (WIN32)
- target_compile_options(onnxruntime_perf_test PRIVATE ${disabled_warnings})
- if (NOT DEFINED SYS_PATH_LIB)
- set(SYS_PATH_LIB shlwapi)
+ if (onnxruntime_USE_ROCM)
+ target_include_directories(onnxruntime_perf_test PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
+ endif()
+ if (WIN32)
+ target_compile_options(onnxruntime_perf_test PRIVATE ${disabled_warnings})
+ if (NOT DEFINED SYS_PATH_LIB)
+ set(SYS_PATH_LIB shlwapi)
+ endif()
endif()
- endif()
- if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
- set_target_properties(onnxruntime_perf_test PROPERTIES
- XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO"
- )
- endif()
- if (onnxruntime_BUILD_SHARED_LIB)
- #It will dynamically link to onnxruntime. So please don't add onxruntime_graph/onxruntime_framework/... here.
- #onnxruntime_common is kind of ok because it is thin, tiny and totally stateless.
- set(onnxruntime_perf_test_libs
+ if (onnxruntime_BUILD_SHARED_LIB)
+      #It will dynamically link to onnxruntime. So please don't add onnxruntime_graph/onnxruntime_framework/... here.
+ #onnxruntime_common is kind of ok because it is thin, tiny and totally stateless.
+ set(onnxruntime_perf_test_libs
onnx_test_runner_common onnxruntime_test_utils onnxruntime_common
onnxruntime onnxruntime_flatbuffers onnx_test_data_proto
${onnxruntime_EXTERNAL_LIBRARIES}
${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS})
- if(NOT WIN32)
- list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp)
- if(onnxruntime_USE_SNPE)
- list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe)
+ if(NOT WIN32)
+ list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp)
+ if(onnxruntime_USE_SNPE)
+ list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe)
+ endif()
endif()
+ if (CMAKE_SYSTEM_NAME STREQUAL "Android")
+ list(APPEND onnxruntime_perf_test_libs ${android_shared_libs})
+ endif()
+ target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads)
+ if(WIN32)
+ target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32)
+ endif()
+ else()
+ target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs})
endif()
- if (CMAKE_SYSTEM_NAME STREQUAL "Android")
- list(APPEND onnxruntime_perf_test_libs ${android_shared_libs})
- endif()
- target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads)
- if(WIN32)
- target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32)
- endif()
- if(tensorflow_C_PACKAGE_PATH)
- target_include_directories(onnxruntime_perf_test PRIVATE ${tensorflow_C_PACKAGE_PATH}/include)
- target_link_directories(onnxruntime_perf_test PRIVATE ${tensorflow_C_PACKAGE_PATH}/lib)
- target_link_libraries(onnxruntime_perf_test PRIVATE tensorflow)
- target_compile_definitions(onnxruntime_perf_test PRIVATE HAVE_TENSORFLOW)
- endif()
- else()
- target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs})
- endif()
- set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
+ set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS AND NOT onnxruntime_BUILD_SHARED_LIB)
- target_link_libraries(onnxruntime_perf_test PRIVATE onnxruntime_language_interop onnxruntime_pyop)
- endif()
+ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS AND NOT onnxruntime_BUILD_SHARED_LIB)
+ target_link_libraries(onnxruntime_perf_test PRIVATE onnxruntime_language_interop onnxruntime_pyop)
+ endif()
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000")
+ if (onnxruntime_USE_TVM)
+ if (WIN32)
+ target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000")
+ endif()
endif()
endif()
-
# shared lib
if (onnxruntime_BUILD_SHARED_LIB)
onnxruntime_add_static_library(onnxruntime_mocked_allocator ${TEST_SRC_DIR}/util/test_allocator.cc)
@@ -1275,11 +1266,17 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
list(APPEND onnxruntime_shared_lib_test_LIBS cpuinfo)
endif()
if (onnxruntime_USE_CUDA)
- list(APPEND onnxruntime_shared_lib_test_LIBS cudart)
+ list(APPEND onnxruntime_shared_lib_test_LIBS CUDA::cudart)
+ endif()
+ if (onnxruntime_USE_ROCM)
+ list(APPEND onnxruntime_shared_lib_test_LIBS hip::host)
endif()
if (onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
endif()
+ if (onnxruntime_USE_DML)
+ list(APPEND onnxruntime_shared_lib_test_LIBS d3d12.lib)
+ endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs})
endif()
@@ -1294,6 +1291,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu)
endif()
+ if (onnxruntime_USE_ROCM)
+ target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include)
+ target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__)
+ endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
target_sources(onnxruntime_shared_lib_test PRIVATE
"${ONNXRUNTIME_ROOT}/core/platform/android/cxa_demangle.cc"
@@ -1302,7 +1303,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_compile_definitions(onnxruntime_shared_lib_test PRIVATE USE_DUMMY_EXA_DEMANGLE=1)
endif()
- if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+ if (IOS)
add_custom_command(
TARGET onnxruntime_shared_lib_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
@@ -1389,7 +1390,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_compile_options(onnxruntime_mlas_test PRIVATE "$<$:SHELL:--compiler-options /wd26426>"
"$<$>:/wd26426>")
endif()
- if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+ if(IOS)
set_target_properties(onnxruntime_mlas_test PROPERTIES
XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO"
)
@@ -1590,7 +1591,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
DEPENDS ${all_dependencies}
)
- if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+ if (IOS)
add_custom_command(
TARGET onnxruntime_customopregistration_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
@@ -1662,6 +1663,38 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI
${ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG})
endif()
+if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUILD OR onnxruntime_MINIMAL_BUILD_CUSTOM_OPS))
+
+ file(GLOB_RECURSE custom_op_local_function_test_library_src
+ "${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.cc"
+ "${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.h"
+ "${TEST_SRC_DIR}/testdata/custom_op_local_function/dummy_gemm.cc"
+ "${TEST_SRC_DIR}/testdata/custom_op_local_function/dummy_gemm.h"
+ )
+
+ onnxruntime_add_shared_library_module(custom_op_local_function ${custom_op_local_function_test_library_src})
+
+ onnxruntime_add_include_to_target(custom_op_local_function onnxruntime_common GTest::gtest GTest::gmock)
+ target_include_directories(custom_op_local_function PRIVATE ${REPO_ROOT}/include/onnxruntime/core/session
+ ${REPO_ROOT}/include/onnxruntime/core/common)
+
+ if(UNIX)
+ if (APPLE)
+ set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip")
+ else()
+ string(CONCAT ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG
+ "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.lds "
+ "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
+ endif()
+ else()
+ set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG
+ "-DEF:${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.def")
+ endif()
+
+ set_property(TARGET custom_op_local_function APPEND_STRING PROPERTY LINK_FLAGS
+ ${ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG})
+endif()
+
if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD)
set (onnxruntime_logging_apis_test_SRC
${ONNXRUNTIME_LOGGING_APIS_TEST_SRC_DIR}/test_logging_apis.cc)
@@ -1707,7 +1740,7 @@ endif()
# limit to only test on windows first, due to a runtime path issue on linux
if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
- AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS"
+ AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS"
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android"
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten"
AND NOT onnxruntime_USE_ROCM)
diff --git a/cmake/onnxruntime_visionos.toolchain.cmake b/cmake/onnxruntime_visionos.toolchain.cmake
new file mode 100644
index 0000000000000..7343fe43c74be
--- /dev/null
+++ b/cmake/onnxruntime_visionos.toolchain.cmake
@@ -0,0 +1,12 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+set(CMAKE_SYSTEM_NAME visionOS)
+set(CMAKE_SYSTEM_PROCESSOR arm64)
+
+if (NOT DEFINED CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM AND NOT DEFINED CMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY)
+ set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED NO)
+endif()
+
+SET(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_MODULES "YES")
+SET(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "YES")
diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index 9014089cb6112..7a49e90c00bce 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -225,9 +225,13 @@ else()
"SHELL:-s EXPORT_ALL=0"
"SHELL:-s VERBOSE=0"
"SHELL:-s FILESYSTEM=0"
+ "SHELL:-s INCOMING_MODULE_JS_API=[preRun,locateFile,arguments,onExit,wasmMemory,buffer,instantiateWasm,mainScriptUrlOrBlob]"
+ "SHELL:-s WASM_BIGINT=1"
${WASM_API_EXCEPTION_CATCHING}
--no-entry
+ "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre.js\""
)
+ set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js)
if (onnxruntime_USE_JSEP)
# NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU
@@ -236,11 +240,11 @@ else()
target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JSEP=1)
target_link_options(onnxruntime_webassembly PRIVATE
- --pre-js "${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js"
+ "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\""
"SHELL:-s ASYNCIFY=1"
"SHELL:-s ASYNCIFY_STACK_SIZE=65536"
)
- set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js)
+ set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js)
endif()
if (onnxruntime_EMSCRIPTEN_SETTINGS)
@@ -251,23 +255,27 @@ else()
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options(onnxruntime_webassembly PRIVATE
- "SHELL:-s ASSERTIONS=2"
+ # NOTE: use "SHELL:-s ASSERTIONS=2" to enable more strict assertions, which may help debugging segfaults.
+ # However, it may be very slow.
+ # "SHELL:-s ASSERTIONS=2"
+ "SHELL:-s ASSERTIONS=1"
"SHELL:-s SAFE_HEAP=1"
"SHELL:-s STACK_OVERFLOW_CHECK=2"
- "SHELL:-s DEMANGLE_SUPPORT=1"
)
else()
target_link_options(onnxruntime_webassembly PRIVATE
"SHELL:-s ASSERTIONS=0"
"SHELL:-s SAFE_HEAP=0"
"SHELL:-s STACK_OVERFLOW_CHECK=0"
- "SHELL:-s DEMANGLE_SUPPORT=0"
--closure 1
)
endif()
if (onnxruntime_USE_WEBNN)
- set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT")
+ set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind")
+ if (onnxruntime_DISABLE_RTTI)
+ set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -fno-rtti -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+ endif()
endif()
# Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions.
@@ -281,6 +289,7 @@ else()
target_link_options(onnxruntime_webassembly PRIVATE
"SHELL:-s EXPORT_NAME=ortWasmThreaded"
"SHELL:-s DEFAULT_PTHREAD_STACK_SIZE=131072"
+ "SHELL:-s PTHREAD_POOL_SIZE=Module[\\\"numThreads\\\"]-1"
)
else()
target_link_options(onnxruntime_webassembly PRIVATE
@@ -306,5 +315,9 @@ else()
list(JOIN target_name_list "-" target_name)
- set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME ${target_name})
+ if (onnxruntime_USE_JSEP)
+ string(APPEND target_name ".jsep")
+ endif()
+
+ set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME ${target_name} SUFFIX ".mjs")
endif()
diff --git a/cmake/patches/abseil/absl_windows.patch b/cmake/patches/abseil/absl_windows.patch
index 66ef0c5125a74..584c49d612293 100644
--- a/cmake/patches/abseil/absl_windows.patch
+++ b/cmake/patches/abseil/absl_windows.patch
@@ -25,17 +25,91 @@ index a6efc98e..8c4de8e7 100644
"/wd4800",
]
diff --git a/absl/copts/copts.py b/absl/copts/copts.py
-index 0d6c1ec3..75fd935f 100644
+index e6e11949..0aa7d868 100644
--- a/absl/copts/copts.py
+++ b/absl/copts/copts.py
-@@ -132,10 +132,6 @@ COPT_VARS = {
- "/wd4068", # unknown pragma
- # qualifier applied to function type has no meaning; ignored
- "/wd4180",
-- # conversion from 'type1' to 'type2', possible loss of data
-- "/wd4244",
-- # conversion from 'size_t' to 'type', possible loss of data
-- "/wd4267",
- # The decorated name was longer than the compiler limit
- "/wd4503",
- # forcing value to bool 'true' or 'false' (performance warning)
+@@ -115,10 +115,6 @@ MSVC_WARNING_FLAGS = [
+ "/wd4068", # unknown pragma
+ # qualifier applied to function type has no meaning; ignored
+ "/wd4180",
+- # conversion from 'type1' to 'type2', possible loss of data
+- "/wd4244",
+- # conversion from 'size_t' to 'type', possible loss of data
+- "/wd4267",
+ # The decorated name was longer than the compiler limit
+ "/wd4503",
+ # forcing value to bool 'true' or 'false' (performance warning)
+diff --git a/absl/debugging/symbolize_win32.inc b/absl/debugging/symbolize_win32.inc
+index 53a099a1..34d210d6 100644
+--- a/absl/debugging/symbolize_win32.inc
++++ b/absl/debugging/symbolize_win32.inc
+@@ -35,15 +35,15 @@ ABSL_NAMESPACE_BEGIN
+
+ static HANDLE process = NULL;
+
+-void InitializeSymbolizer(const char*) {
+- if (process != nullptr) {
+- return;
+- }
++namespace {
++void InitializeSymbolizerImpl() {
++
+ process = GetCurrentProcess();
+
+ // Symbols are not loaded until a reference is made requiring the
+ // symbols be loaded. This is the fastest, most efficient way to use
+ // the symbol handler.
++
+ SymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME);
+ if (!SymInitialize(process, nullptr, true)) {
+ // GetLastError() returns a Win32 DWORD, but we assign to
+@@ -54,6 +54,36 @@ void InitializeSymbolizer(const char*) {
+ }
+ }
+
++bool LookupAndInitialize(const void* pc, SYMBOL_INFO* symbol) {
++ auto hProcess = (process != NULL) ? process : GetCurrentProcess();
++ if (SymFromAddr(hProcess, reinterpret_cast<DWORD64>(pc), nullptr, symbol) != TRUE) {
++ if (GetLastError() == ERROR_INVALID_HANDLE && process == NULL) {
++ InitializeSymbolizerImpl();
++ if (SymFromAddr(process, reinterpret_cast<DWORD64>(pc), nullptr, symbol) != TRUE) {
++ return false;
++ }
++ } else {
++ return false;
++ }
++ return false;
++ }
++ return true;
++}
++}
++
++void InitializeSymbolizer(const char*) {
++ if (process != nullptr) {
++ return;
++ }
++
++ alignas(SYMBOL_INFO) char buf[sizeof(SYMBOL_INFO) + MAX_SYM_NAME];
++ SYMBOL_INFO* symbol = reinterpret_cast<SYMBOL_INFO*>(buf);
++ symbol->SizeOfStruct = sizeof(SYMBOL_INFO);
++ symbol->MaxNameLen = MAX_SYM_NAME;
++
++ static_cast<void>(LookupAndInitialize(reinterpret_cast<const void*>(&InitializeSymbolizer), symbol));
++}
++
+ bool Symbolize(const void* pc, char* out, int out_size) {
+ if (out_size <= 0) {
+ return false;
+@@ -62,9 +92,11 @@ bool Symbolize(const void* pc, char* out, int out_size) {
+ SYMBOL_INFO* symbol = reinterpret_cast<SYMBOL_INFO*>(buf);
+ symbol->SizeOfStruct = sizeof(SYMBOL_INFO);
+ symbol->MaxNameLen = MAX_SYM_NAME;
+- if (!SymFromAddr(process, reinterpret_cast<DWORD64>(pc), nullptr, symbol)) {
++
++ if(!LookupAndInitialize(pc, symbol)) {
+ return false;
+ }
++
+ const size_t out_size_t = static_cast<size_t>(out_size);
+ strncpy(out, symbol->Name, out_size_t);
+ if (out[out_size_t - 1] != '\0') {
diff --git a/cmake/patches/coremltools/crossplatformbuild.patch b/cmake/patches/coremltools/crossplatformbuild.patch
new file mode 100644
index 0000000000000..7f2268f50c82e
--- /dev/null
+++ b/cmake/patches/coremltools/crossplatformbuild.patch
@@ -0,0 +1,155 @@
+diff --git a/mlmodel/src/MILBlob/Blob/FileWriter.cpp b/mlmodel/src/MILBlob/Blob/FileWriter.cpp
+index adc7bfcf..7b2bf9cc 100644
+--- a/mlmodel/src/MILBlob/Blob/FileWriter.cpp
++++ b/mlmodel/src/MILBlob/Blob/FileWriter.cpp
+@@ -8,8 +8,12 @@
+
+ #include
+ #include
++
++// ORT_EDIT: Exclude mmap on Windows. Not used in this file anyway.
++#if !defined(_WIN32)
+ #include
+ #include
++#endif
+
+ using namespace MILBlob;
+ using namespace MILBlob::Blob;
+diff --git a/mlmodel/src/MILBlob/Fp16.cpp b/mlmodel/src/MILBlob/Fp16.cpp
+index ae1e71a1..77a7161f 100644
+--- a/mlmodel/src/MILBlob/Fp16.cpp
++++ b/mlmodel/src/MILBlob/Fp16.cpp
+@@ -5,6 +5,8 @@
+
+ #include "MILBlob/Fp16.hpp"
+
++// ORT_EDIT: Exclude clang specific pragmas from other builds
++#if defined(__clang__)
+ // fp16 lib code has some conversion warnings we don't want to globally ignore
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wincompatible-pointer-types"
+@@ -12,6 +14,9 @@
+ #pragma clang diagnostic ignored "-Wconversion"
+ #include "fp16/fp16.h"
+ #pragma clang diagnostic pop
++#else
++#include "fp16/fp16.h"
++#endif
+
+ using namespace MILBlob;
+
+diff --git a/modelpackage/src/ModelPackage.cpp b/modelpackage/src/ModelPackage.cpp
+index 8fee56b9..99e0d8d6 100644
+--- a/modelpackage/src/ModelPackage.cpp
++++ b/modelpackage/src/ModelPackage.cpp
+@@ -26,7 +26,14 @@ namespace std {
+ #else
+ #error "missing required header "
+ #endif
++
++// ORT_EDIT: Use UuidCreate on Windows.
++#if defined(_WIN32)
++#pragma comment(lib, "rpcrt4.lib") // UuidCreate
++#include <rpc.h>
++#else
+ #include <uuid/uuid.h>
++#endif
+ #include
+
+ #if defined(__cplusplus)
+@@ -187,7 +194,10 @@ public:
+ ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description);
+ };
+
++// ORT_EDIT: pragma only available on APPLE platforms
++#if defined(__APPLE__)
+ #pragma mark ModelPackageImpl
++#endif
+
+ ModelPackageImpl::ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary, bool readOnly)
+ : m_packagePath(path),
+@@ -372,6 +382,20 @@ std::filesystem::path ModelPackageImpl::getItemPath(const std::string& name, con
+ }
+
+ std::string ModelPackageImpl::generateIdentifier() const {
++// ORT_EDIT: Use built-in UUID generation on Windows
++#if defined(_WIN32)
++ UUID uuid;
++ UuidCreate(&uuid);
++
++ RPC_CSTR uuidStr;
++ UuidToStringA(&uuid, &uuidStr);
++
++ std::string uuidStrCpp(reinterpret_cast<char*>(uuidStr));
++
++ RpcStringFreeA(&uuidStr);
++
++ return uuidStrCpp;
++#else
+ uuid_t uuid;
+
+ // uuid_unparse generates a 36-character null-terminated string (37 bytes).
+@@ -383,6 +407,7 @@ std::string ModelPackageImpl::generateIdentifier() const {
+ uuid_unparse(uuid, buf);
+
+ return std::string(buf);
++#endif
+ }
+
+ ModelPackageItemInfo ModelPackageImpl::createFile(const std::string& name, const std::string& author, const std::string& description) {
+@@ -468,7 +493,13 @@ std::shared_ptr ModelPackageImpl::findItem(const std::stri
+ auto author = itemInfoEntry->getString(kModelPackageItemInfoAuthorKey);
+ auto description = itemInfoEntry->getString(kModelPackageItemInfoDescriptionKey);
+
++// ORT_EDIT: need to use path.string() on Windows
++#if defined(_WIN32)
++ return std::make_shared(std::make_shared(identifier, path.string(), name, author, description));
++
++#else
+ return std::make_shared(std::make_shared(identifier, path, name, author, description));
++#endif
+ }
+
+ std::shared_ptr ModelPackageImpl::findItem(const std::string& name, const std::string& author) const
+@@ -514,7 +545,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier)
+ }
+
+ auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey);
+- if (0 != std::remove(path.c_str())) {
++ // ORT_EDIT: std::remove doesn't work on Windows. Use std::filesystem::remove instead.
++ // if (0 != std::remove(path.c_str())) {
++ if (!std::filesystem::remove(path)) {
+ throw std::runtime_error("Failed to remove file at path: " + path.string());
+ }
+
+@@ -525,13 +558,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path)
+ {
+ try {
+ ModelPackageImpl(path, false, true);
+- } catch (std::runtime_error& e) {
++ } catch (std::runtime_error& /*e*/) { // ORT_EDIT: comment out unused variable
+ return false;
+ }
+ return true;
+ }
+
++// ORT_EDIT: pragma only available on APPLE platforms
++#if defined(__APPLE__)
+ #pragma mark ModelPackage
++#endif
+
+ ModelPackage::ModelPackage(const std::string& packagePath, bool createIfNecessary, bool readOnly)
+ : m_modelPackageImpl(std::make_shared(packagePath, createIfNecessary, readOnly))
+@@ -544,7 +580,12 @@ ModelPackage::~ModelPackage()
+
+ std::string ModelPackage::path() const
+ {
++// ORT_EDIT: Windows doesn't automatically convert to std::string as the native format could be char or wchar.
++#if defined(_WIN32)
++ return m_modelPackageImpl->path().string();
++#else
+ return m_modelPackageImpl->path();
++#endif
+ }
+
+ std::string ModelPackage::setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description)
diff --git a/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch
new file mode 100644
index 0000000000000..afb19a45ce0f4
--- /dev/null
+++ b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch
@@ -0,0 +1,22 @@
+diff --git a/include/cpuinfo.h b/include/cpuinfo.h
+index c46b65e..8b83a64 100644
+--- a/include/cpuinfo.h
++++ b/include/cpuinfo.h
+@@ -18,7 +18,7 @@
+ #define CPUINFO_ARCH_X86 1
+ #endif
+
+-#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
++#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
+ #define CPUINFO_ARCH_X86_64 1
+ #endif
+
+@@ -26,7 +26,7 @@
+ #define CPUINFO_ARCH_ARM 1
+ #endif
+
+-#if defined(__aarch64__) || defined(_M_ARM64)
++#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+ #define CPUINFO_ARCH_ARM64 1
+ #endif
+
diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch
index fb2678ef1bdce..fbe8db37ecb0e 100644
--- a/cmake/patches/flatbuffers/flatbuffers.patch
+++ b/cmake/patches/flatbuffers/flatbuffers.patch
@@ -2,35 +2,11 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3987eac9..5e5462f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-@@ -223,7 +223,7 @@ elseif(CMAKE_COMPILER_IS_GNUCXX)
- "${CMAKE_CXX_FLAGS} -std=c++0x")
- endif(CYGWIN)
- set(CMAKE_CXX_FLAGS
-- "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror -Wextra -Werror=shadow")
-+ "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror -Wextra -Werror=shadow -Wno-error=stringop-overflow")
- set(FLATBUFFERS_PRIVATE_CXX_FLAGS "-Wold-style-cast")
- if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.4)
- if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-diff --git a/src/idl_gen_rust.cpp b/src/idl_gen_rust.cpp
-index 55b8439b..dc03e8a8 100644
---- a/src/idl_gen_rust.cpp
-+++ b/src/idl_gen_rust.cpp
-@@ -406,7 +406,8 @@ class RustGenerator : public BaseGenerator {
- // example: f(A, D::E) -> super::D::E
- // does not include leaf object (typically a struct type).
-
-- size_t i = 0;
-+ // fix unused but set variable warning
-+ //size_t i = 0;
- std::stringstream stream;
-
- auto s = src->components.begin();
-@@ -417,7 +418,7 @@ class RustGenerator : public BaseGenerator {
- if (*s != *d) { break; }
- ++s;
- ++d;
-- ++i;
-+ //++i;
- }
-
- for (; s != src->components.end(); ++s) { stream << "super::"; }
+@@ -279,5 +279,5 @@
+ # Append FLATBUFFERS_CXX_FLAGS to CMAKE_CXX_FLAGS.
+ if(DEFINED FLATBUFFERS_CXX_FLAGS)
+ message(STATUS "extend CXX_FLAGS with ${FLATBUFFERS_CXX_FLAGS}")
+- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS}")
++ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow")
+ endif()
+ message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
diff --git a/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch
new file mode 100644
index 0000000000000..e503a512a74ff
--- /dev/null
+++ b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch
@@ -0,0 +1,30 @@
+diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h
+index 99f3ccc..a11de9d 100644
+--- a/bestla/bestla/bestla_prologue_b.h
++++ b/bestla/bestla/bestla_prologue_b.h
+@@ -456,9 +456,8 @@ class WeightKBlockNInteger {
+ auto tmpscales = tmp;
+ auto tmpzeropoints = reinterpret_cast(tmpscales + N * blks);
+ if (scales) {
+- for (size_t i = 0; i < N * blks; i += 2) {
++ for (size_t i = 0; i < N * blks; i ++) {
+ tmpscales[i] = scales[i] / 16;
+- tmpscales[i + 1] = scales[i + 1] / 16;
+ }
+ }
+ if (zero_points) {
+diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h
+index 6783ee8..59822e5 100644
+--- a/bestla/bestla/kernel_avx512f.h
++++ b/bestla/bestla/kernel_avx512f.h
+@@ -673,8 +673,8 @@ inline BTLA_CODE decompress_kblock_s3_s8fp(utils::bit2x4* bit2ptr, utils::bit1x8
+ zmm1 = _mm512_sllv_epi32(zmm1, zmm_shift); // int3_clip => int8
+ zmm2 = _mm512_sllv_epi32(zmm2, zmm_shift); // int3_clip => int8
+
+- _mm512_storeu_epi8((__m512i*)dst, zmm1);
+- _mm512_storeu_epi8((__m512i*)(dst + 64), zmm2);
++ _mm512_storeu_si512((__m512i*)dst, zmm1);
++ _mm512_storeu_si512((__m512i*)(dst + 64), zmm2);
+ };
+
+ assert(head_ignore_num % 8 == 0);
diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch
index a2d7672a3d48d..162d33581a5ca 100644
--- a/cmake/patches/onnx/onnx.patch
+++ b/cmake/patches/onnx/onnx.patch
@@ -1,8 +1,8 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 4dd56b6e..018da488 100644
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 6d7ca846..69aa622f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-@@ -397,6 +397,7 @@ if (MSVC)
+@@ -499,6 +499,7 @@ if (MSVC)
endif()
else()
# On non-Windows, hide all symbols we don't need
@@ -10,7 +10,7 @@ index 4dd56b6e..018da488 100644
set(ONNX_API_DEFINE "-DONNX_API=__attribute__\(\(__visibility__\(\"default\"\)\)\)")
set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden)
set_target_properties(onnx_proto PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
-@@ -548,20 +549,9 @@ endif()
+@@ -653,20 +654,9 @@ endif()
if(MSVC)
target_compile_options(onnx_proto
PRIVATE /MP
@@ -31,8 +31,30 @@ index 4dd56b6e..018da488 100644
${EXTRA_FLAGS})
if(ONNX_USE_PROTOBUF_SHARED_LIBS)
target_compile_options(onnx_proto
+diff --git a/onnx/common/file_utils.h b/onnx/common/file_utils.h
+index b847798e..a6c31904 100644
+--- a/onnx/common/file_utils.h
++++ b/onnx/common/file_utils.h
+@@ -6,7 +6,6 @@
+
+ #pragma once
+
+-#include <filesystem>
+ #include
+ #include
+
+@@ -17,8 +16,7 @@ namespace ONNX_NAMESPACE {
+
+ template <typename T>
+ void LoadProtoFromPath(const std::string proto_path, T& proto) {
+- std::filesystem::path proto_u8_path = std::filesystem::u8path(proto_path);
+- std::fstream proto_stream(proto_u8_path, std::ios::in | std::ios::binary);
++ std::fstream proto_stream(proto_path, std::ios::in | std::ios::binary);
+ if (!proto_stream.good()) {
+ fail_check("Unable to open proto file: ", proto_path, ". Please check if it is a valid proto. ");
+ }
diff --git a/onnx/onnx_pb.h b/onnx/onnx_pb.h
-index 0aab3e26..0f859267 100644
+index 0aab3e26..398ac2d6 100644
--- a/onnx/onnx_pb.h
+++ b/onnx/onnx_pb.h
@@ -47,10 +47,28 @@
diff --git a/cmake/patches/protobuf/protobuf_cmake.patch b/cmake/patches/protobuf/protobuf_cmake.patch
index fe8bd25ab147a..35ffc8a7f5ff9 100644
--- a/cmake/patches/protobuf/protobuf_cmake.patch
+++ b/cmake/patches/protobuf/protobuf_cmake.patch
@@ -29,3 +29,27 @@ index 04cb3303a..4025805cf 100644
# When building with "make", "lib" prefix will be added automatically by
# the build tool.
set(LIB_PREFIX)
+diff --git a/src/google/protobuf/map.h b/src/google/protobuf/map.h
+index 008c19225..cbab108c2 100644
+--- a/src/google/protobuf/map.h
++++ b/src/google/protobuf/map.h
+@@ -52,7 +52,8 @@
+ #endif // defined(__cpp_lib_string_view)
+
+ #if !defined(GOOGLE_PROTOBUF_NO_RDTSC) && defined(__APPLE__)
+-#include <mach/mach_time.h>
++// apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
++#include <time.h>
+ #endif
+
+ #include
+@@ -1154,7 +1155,8 @@ class Map {
+ #if defined(__APPLE__)
+ // Use a commpage-based fast time function on Apple environments (MacOS,
+ // iOS, tvOS, watchOS, etc).
+- s += mach_absolute_time();
++ // apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
++ s += clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
+ #elif defined(__x86_64__) && defined(__GNUC__)
+ uint32_t hi, lo;
+ asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
diff --git a/cmake/riscv64.toolchain.cmake b/cmake/riscv64.toolchain.cmake
new file mode 100644
index 0000000000000..0fda239f9a628
--- /dev/null
+++ b/cmake/riscv64.toolchain.cmake
@@ -0,0 +1,35 @@
+# Copyright (c) 2024 SiFive, Inc. All rights reserved.
+# Copyright (c) 2024, Phoebe Chen
+# Licensed under the MIT License.
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES RISCV_TOOLCHAIN_ROOT)
+
+if(NOT RISCV_TOOLCHAIN_ROOT)
+ message(FATAL_ERROR "RISCV_TOOLCHAIN_ROOT is not defined. Please set the RISCV_TOOLCHAIN_ROOT variable.")
+endif()
+
+set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc")
+set(CMAKE_ASM_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc")
+set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-g++")
+
+set(CMAKE_FIND_ROOT_PATH ${RISCV_TOOLCHAIN_ROOT})
+set(CMAKE_SYSROOT "${RISCV_TOOLCHAIN_ROOT}/sysroot")
+set(CMAKE_INCLUDE_PATH "${RISCV_TOOLCHAIN_ROOT}/sysroot/usr/include/")
+set(CMAKE_LIBRARY_PATH "${RISCV_TOOLCHAIN_ROOT}/sysroot/usr/lib/")
+set(CMAKE_PROGRAM_PATH "${RISCV_TOOLCHAIN_ROOT}/sysroot/usr/bin/")
+
+if(RISCV_QEMU_PATH)
+ message(STATUS "RISCV_QEMU_PATH=${RISCV_QEMU_PATH} is defined during compilation.")
+ set(CMAKE_CROSSCOMPILING_EMULATOR "${RISCV_QEMU_PATH};-L;${CMAKE_SYSROOT}")
+endif()
+
+set(CMAKE_CROSSCOMPILING TRUE)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
diff --git a/cmake/wcos_rules_override.cmake b/cmake/wcos_rules_override.cmake
index f3d8093629a42..ec2303b073d5e 100644
--- a/cmake/wcos_rules_override.cmake
+++ b/cmake/wcos_rules_override.cmake
@@ -1,2 +1,2 @@
-set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib)
-set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib)
+set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap.lib)
+set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap.lib)
diff --git a/cmake/winml.cmake b/cmake/winml.cmake
index 268ee3960e75a..d74250b962628 100644
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@@ -836,6 +836,13 @@ if (winml_is_inbox)
target_include_directories(${new_target} PRIVATE ${include_directories})
target_link_libraries(${new_target} PRIVATE ${link_libraries})
target_link_options(${new_target} PRIVATE ${link_options})
+
+ # Attempt to copy linker flags
+ get_target_property(link_flags ${target} LINK_FLAGS)
+
+ if (NOT link_flags MATCHES ".*NOTFOUND")
+ set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}")
+ endif()
endfunction()
if (WAI_ARCH STREQUAL x64 OR WAI_ARCH STREQUAL arm64)
diff --git a/csharp/ApiDocs/ApiDocs.csproj b/csharp/ApiDocs/ApiDocs.csproj
index 994e57913cf47..6081c444ba1af 100644
--- a/csharp/ApiDocs/ApiDocs.csproj
+++ b/csharp/ApiDocs/ApiDocs.csproj
@@ -7,7 +7,7 @@
-
+
all
runtime; build; native; contentfiles; analyzers; buildtransitive
diff --git a/csharp/OnnxRuntime.CSharp.proj b/csharp/OnnxRuntime.CSharp.proj
index 5e43756ced7b1..e09c865a8d163 100644
--- a/csharp/OnnxRuntime.CSharp.proj
+++ b/csharp/OnnxRuntime.CSharp.proj
@@ -20,6 +20,8 @@ CMake creates a target to this project
true
true
None
+ false
+ NativeNuget.nuspec
true
@@ -45,6 +47,17 @@ CMake creates a target to this project
python
+
+
+
+ $(BuildDate)
+ $(BuildTime)
+ $([System.DateTime]::UtcNow.ToString(yyyyMMdd))
+ $([System.DateTime]::UtcNow.ToString(hhmm))
+
+
+
+
@@ -58,7 +71,7 @@ CMake creates a target to this project
-
+
@@ -81,8 +94,6 @@ CMake creates a target to this project
- $([System.DateTime]::UtcNow.ToString(yyyyMMdd))
- $([System.DateTime]::UtcNow.ToString(hhmm))
@(MajorVersionNumber)
$(PackageVersion)$(ReleaseVersionSuffix)
$(PackageVersion)
@@ -91,23 +102,10 @@ CMake creates a target to this project
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
@@ -116,7 +114,7 @@ CMake creates a target to this project
@@ -144,6 +142,34 @@ CMake creates a target to this project
/>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj
index 3d35de1dfc6aa..5552a9eeb0d68 100644
--- a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj
+++ b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj
@@ -7,8 +7,8 @@
-
-
+
+
diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj
index af8fa611a5010..647c0bbe6a242 100644
--- a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj
+++ b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj
@@ -7,8 +7,8 @@
-
-
+
+
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
index 1d15383239baf..e96e2d1535902 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
@@ -39,7 +39,7 @@
- $(MobileTargets);net6.0-android;net6.0-ios
+ $(MobileTargets);net6.0-android;net6.0-ios;net6.0-maccatalyst
true
true
- true
+ true
+ Condition="('$(PlatformTarget)' == 'x64' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' != 'true')) AND
+ Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime.dll')">
onnxruntime.dll
PreserveNewest
false
@@ -146,8 +147,8 @@
PreserveNewest
false
-
-
+
+
onnxruntime.dll
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj
index b90929ad6d1c1..7bda34d266295 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj
@@ -6,7 +6,7 @@
-
+
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj
index 1c9827c5bac62..5ff924bcf82f3 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj
@@ -37,10 +37,10 @@
-
+
-
+
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.bat b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.bat
index 07128fe1620ab..c16f12dc17f79 100755
--- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.bat
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.bat
@@ -52,9 +52,12 @@ IF NOT errorlevel 0 (
%dn% list test\Microsoft.ML.OnnxRuntime.EndToEndTests\Microsoft.ML.OnnxRuntime.EndToEndTests.csproj package
dir test\Microsoft.ML.OnnxRuntime.EndToEndTests\packages\
-IF "%PACKAGENAME%"=="Microsoft.ML.OnnxRuntime.Gpu" (
+set gpu_package=F
+IF "%PACKAGENAME%"=="Microsoft.ML.OnnxRuntime.Gpu" set gpu_package=T
+IF "%PACKAGENAME%"=="Microsoft.ML.OnnxRuntime.Gpu.Windows" set gpu_package=T
+IF %%gpu_package%%=="T" (
set TESTONGPU=ON
- %dn% test -p:DefineConstants=USE_TENSORRT test\Microsoft.ML.OnnxRuntime.EndToEndTests\Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --filter TensorRT
+ %dn% test -p:DefineConstants=USE_TENSORRT test\Microsoft.ML.OnnxRuntime.EndToEndTests\Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --filter TensorRT
IF NOT errorlevel 0 (
@echo "Failed to build or execute the end-to-end test"
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh
index 39f0ff1c2f85e..a500e4bce8fbf 100755
--- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh
@@ -32,7 +32,7 @@ if [ $RunTestCsharp = "true" ]; then
exit 1
fi
- if [ $PACKAGENAME = "Microsoft.ML.OnnxRuntime.Gpu" ]; then
+ if [ $PACKAGENAME = "Microsoft.ML.OnnxRuntime.Gpu" ] || [ $PACKAGENAME = "Microsoft.ML.OnnxRuntime.Gpu.Linux" ]; then
export TESTONGPU=ON
dotnet test -p:DefineConstants=USE_CUDA $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed
if [ $? -ne 0 ]; then
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
index fd8feda359f90..d6a6b9627f418 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
@@ -55,6 +55,9 @@ public void TestSessionOptions()
Assert.Equal(0, opt.InterOpNumThreads);
Assert.Equal(GraphOptimizationLevel.ORT_ENABLE_ALL, opt.GraphOptimizationLevel);
+ // No get, so no verify
+ opt.DisablePerSessionThreads();
+
// try setting options
opt.ExecutionMode = ExecutionMode.ORT_PARALLEL;
Assert.Equal(ExecutionMode.ORT_PARALLEL, opt.ExecutionMode);
@@ -98,7 +101,7 @@ public void TestSessionOptions()
Assert.Contains("[ErrorCode:InvalidArgument] Config key is empty", ex.Message);
// SessionOptions.RegisterOrtExtensions can be manually tested by referencing the
- // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw.
+ // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw.
ex = Assert.Throws(() => { opt.RegisterOrtExtensions(); });
Assert.Contains("Microsoft.ML.OnnxRuntime.Extensions NuGet package must be referenced", ex.Message);
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj
index ee81ab77432d1..ab27d62c3bf3b 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj
@@ -119,8 +119,8 @@
-
-
+
+
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs
index 68b1d5bcc6147..9b72326201322 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs
@@ -612,6 +612,81 @@ public void TestUpdateParameter()
}
}
+ [Fact(DisplayName = "TestTrainingSessionTrainStepWithOrtValues")]
+ public void TestTrainingSessionTrainStepWithOrtValues()
+ {
+ string checkpointPath = Path.Combine(Directory.GetCurrentDirectory(), "checkpoint.ckpt");
+ using (var cleanUp = new DisposableListTest())
+ {
+ var state = CheckpointState.LoadCheckpoint(checkpointPath);
+ cleanUp.Add(state);
+ Assert.NotNull(state);
+ string trainingPath = Path.Combine(Directory.GetCurrentDirectory(), "training_model.onnx");
+ string optimizerPath = Path.Combine(Directory.GetCurrentDirectory(), "adamw.onnx");
+
+ var trainingSession = new TrainingSession(state, trainingPath, optimizerPath);
+ cleanUp.Add(trainingSession);
+
+ float[] expectedOutput = TestDataLoader.LoadTensorFromFile("loss_1.out");
+ var expectedOutputDimensions = new int[] { 1 };
+ float[] inputData = TestDataLoader.LoadTensorFromFile("input-0.in");
+ long[] inputShape = { 2, 784 };
+ Int32[] labelsData = { 1, 1 };
+ long[] labelsShape = { 2 };
+
+ using OrtValue inputOrtValue = OrtValue.CreateTensorValueFromMemory(inputData, inputShape);
+ using OrtValue labelsOrtValue = OrtValue.CreateTensorValueFromMemory(labelsData, labelsShape);
+ var inputValues = new List<OrtValue> { inputOrtValue, labelsOrtValue };
+
+ using (var results = trainingSession.TrainStep(inputValues))
+ {
+ Assert.Single(results);
+ var outputOrtValue = results[0];
+ Assert.True(outputOrtValue.IsTensor);
+ var resultSpan = outputOrtValue.GetTensorDataAsSpan<float>().ToArray();
+ Assert.Equal(expectedOutput, resultSpan, new FloatComparer());
+ }
+ }
+ }
+
+ [Fact(DisplayName = "TestTrainingSessionEvalStepWithOrtValues")]
+ public void TestTrainingSessionEvalStepWithOrtValues()
+ {
+ string checkpointPath = Path.Combine(Directory.GetCurrentDirectory(), "checkpoint.ckpt");
+ using (var cleanUp = new DisposableListTest())
+ {
+ var state = CheckpointState.LoadCheckpoint(checkpointPath);
+ cleanUp.Add(state);
+ Assert.NotNull(state);
+ string trainingPath = Path.Combine(Directory.GetCurrentDirectory(), "training_model.onnx");
+ string optimizerPath = Path.Combine(Directory.GetCurrentDirectory(), "adamw.onnx");
+ string evalPath = Path.Combine(Directory.GetCurrentDirectory(), "eval_model.onnx");
+
+ var trainingSession = new TrainingSession(state, trainingPath, evalPath, optimizerPath);
+ cleanUp.Add(trainingSession);
+
+ float[] expectedOutput = TestDataLoader.LoadTensorFromFile("loss_1.out");
+ var expectedOutputDimensions = new int[] { 1 };
+ float[] inputData = TestDataLoader.LoadTensorFromFile("input-0.in");
+ long[] inputShape = { 2, 784 };
+ Int32[] labelsData = { 1, 1 };
+ long[] labelsShape = { 2 };
+
+ using OrtValue inputOrtValue = OrtValue.CreateTensorValueFromMemory(inputData, inputShape);
+ using OrtValue labelsOrtValue = OrtValue.CreateTensorValueFromMemory(labelsData, labelsShape);
+ var inputValues = new List<OrtValue> { inputOrtValue, labelsOrtValue };
+
+ using (var results = trainingSession.EvalStep(inputValues))
+ {
+ Assert.Single(results);
+ var outputOrtValue = results[0];
+ Assert.True(outputOrtValue.IsTensor);
+ var resultSpan = outputOrtValue.GetTensorDataAsSpan<float>().ToArray();
+ Assert.Equal(expectedOutput, resultSpan, new FloatComparer());
+ }
+ }
+ }
+
internal class FloatComparer : IEqualityComparer<float>
{
private float atol = 1e-3f;
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj
index 37e83be5e33a1..40f6d453c6a90 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj
@@ -11,6 +11,6 @@
-
+
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj
index 11855032584a3..ef7e0825e919e 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj
@@ -134,7 +134,7 @@
5.0.0.2083
-
+
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
index 715aed7e1d64f..7f3d5d6624b07 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
@@ -145,7 +145,7 @@ private void TestCUDAProviderOptions()
private void CanRunInferenceOnAModelWithTensorRT()
{
string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "squeezenet.onnx");
-
+
int deviceId = 0;
string deviceIdStr = System.Environment.GetEnvironmentVariable("ONNXRUNTIME_TEST_GPU_DEVICE_ID");
if (!string.IsNullOrEmpty(deviceIdStr) && int.TryParse(deviceIdStr, out int parsedValue) && parsedValue >= 0)
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj
index 352de5db00920..56e65833724f6 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj
@@ -99,7 +99,7 @@
2.4.1
-
+
5.0.0.2083
diff --git a/csharp/tools/MauiModelTester/MauiModelTester.csproj b/csharp/tools/MauiModelTester/MauiModelTester.csproj
index b0a17978328c0..39e688ce6c1b8 100644
--- a/csharp/tools/MauiModelTester/MauiModelTester.csproj
+++ b/csharp/tools/MauiModelTester/MauiModelTester.csproj
@@ -1,8 +1,8 @@
- net6.0-android;net6.0-ios
- $(TargetFrameworks);net6.0-windows10.0.19041.0
+ net8.0-ios;net8.0-android34.0
+ $(TargetFrameworks);net8.0-windows10.0.19041.0
Exe
MauiModelTester
true
@@ -21,7 +21,7 @@
1
12.0
- 21.0
+ 29.0
10.0.17763.0
10.0.17763.0
true
@@ -51,7 +51,7 @@
-
+
diff --git a/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml b/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml
index cc320dab474a0..2ef2296d7441f 100644
--- a/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml
+++ b/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml
@@ -4,5 +4,5 @@
-
+
\ No newline at end of file
diff --git a/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj b/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj
index 24f0d14ad9903..e0420a6ed0456 100644
--- a/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj
+++ b/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj
@@ -80,7 +80,7 @@
-
+
diff --git a/dockerfiles/Dockerfile.migraphx b/dockerfiles/Dockerfile.migraphx
index bc513a8e8ba6d..c3541a8bd3425 100644
--- a/dockerfiles/Dockerfile.migraphx
+++ b/dockerfiles/Dockerfile.migraphx
@@ -5,57 +5,22 @@
# Dockerfile to run ONNXRuntime with MIGraphX integration
#--------------------------------------------------------------------------
-FROM ubuntu:20.04
+FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main
-ARG ROCM_VERSION=5.4
-# MIGraphX version should be the same as ROCm version
-ARG MIGRAPHX_VERSION=rocm-5.4.0
-ENV DEBIAN_FRONTEND noninteractive
-ENV MIGRAPHX_DISABLE_FAST_GELU=1
-RUN apt-get clean && apt-get update && apt-get install -y locales
-RUN locale-gen en_US.UTF-8
-RUN update-locale LANG=en_US.UTF-8
-ENV LC_ALL C.UTF-8
-ENV LANG C.UTF-8
+ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
-# Install rocm
-RUN apt-get update && apt-get install -y gnupg2 --no-install-recommends curl && \
- curl -sL http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
- sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
-
-RUN apt-get update &&\
- apt-get install -y sudo git bash build-essential rocm-dev python3-dev python3-pip miopen-hip \
- rocblas half aria2 libnuma-dev pkg-config
-
-RUN aria2c -q -d /tmp -o cmake-3.27.3-linux-x86_64.tar.gz \
-https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz &&\
-tar -zxf /tmp/cmake-3.27.3-linux-x86_64.tar.gz --strip=1 -C /usr
-
-# Install rbuild
-RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz numpy yapf==0.28.0
-
-ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH}
-
-# Install MIGraphX from source
-RUN mkdir -p /migraphx
-RUN cd /migraphx && git clone --depth=1 --branch ${MIGRAPHX_VERSION} https://github.com/ROCmSoftwarePlatform/AMDMIGraphX src
-RUN cd /migraphx && rbuild package --cxx /opt/rocm/llvm/bin/clang++ -d /migraphx/deps -B /migraphx/build -S /migraphx/src/ -DPYTHON_EXECUTABLE=/usr/bin/python3
-RUN dpkg -i /migraphx/build/*.deb
-RUN rm -rf /migraphx
-
-# Install rocm ep dependencies
RUN apt-get update &&\
- apt-get install -y rocrand rccl hipsparse hipfft hipcub hipblas rocthrust
+ apt-get install -y migraphx
WORKDIR /code
# Prepare onnxruntime repository & build onnxruntime
RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
/bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\
- cd onnxruntime &&\
+ cd onnxruntime && pip install --upgrade pip &&\
/bin/sh ./build.sh --allow_running_as_root --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` --config Release --parallel \
--skip_tests --build_wheel --use_rocm --rocm_version=${ROCM_VERSION} --rocm_home /opt/rocm --use_migraphx &&\
pip install /code/onnxruntime/build/Linux/Release/dist/*.whl
diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino
index 78d04a51ba162..75898770acf28 100644
--- a/dockerfiles/Dockerfile.openvino
+++ b/dockerfiles/Dockerfile.openvino
@@ -1,9 +1,9 @@
#-------------------------------------------------------------------------
-# Copyright(C) 2021-2023 Intel Corporation.
+# Copyright(C) 2021-2024 Intel Corporation.
# SPDX-License-Identifier: MIT
#--------------------------------------------------------------------------
-ARG OPENVINO_VERSION=2023.0.0
+ARG OPENVINO_VERSION=2024.0.0
# Build stage
@@ -13,11 +13,11 @@ ENV WORKDIR_PATH=/home/openvino
WORKDIR $WORKDIR_PATH
ENV DEBIAN_FRONTEND noninteractive
-ARG DEVICE=CPU_FP32
+ARG DEVICE=CPU
ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
ARG ONNXRUNTIME_BRANCH=main
-ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake
+ENV OpenVINO_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake
USER root
RUN apt update; apt install -y git protobuf-compiler libprotobuf-dev
diff --git a/dockerfiles/Dockerfile.openvino-centos7 b/dockerfiles/Dockerfile.openvino-centos7
deleted file mode 100755
index 697db44801e3b..0000000000000
--- a/dockerfiles/Dockerfile.openvino-centos7
+++ /dev/null
@@ -1,105 +0,0 @@
-#-------------------------------------------------------------------------
-# Copyright(C) 2021 Intel Corporation.
-# SPDX-License-Identifier: MIT
-#--------------------------------------------------------------------------
-
-FROM centos:7.8.2003
-
-WORKDIR /code
-
-ARG MY_ROOT=/code
-ARG YUM_OV_PACKAGE=intel-openvino-runtime-centos7-2021.4.752.x86_64
-ARG DEVICE=CPU_FP32
-ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime
-ARG ONNXRUNTIME_BRANCH=main
-
-ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.752
-ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share
-ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64
-ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/ngraph/cmake
-ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/inference_engine/external/gna/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/omp/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH}
-ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/opencv/share/OpenCV
-ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/opencv/lib:${INTEL_OPENVINO_DIR}/opencv/share/OpenCV/3rdparty/lib:${LD_LIBRARY_PATH}
-ENV HDDL_INSTALL_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl
-ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl/lib:$LD_LIBRARY_PATH
-ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:$LD_LIBRARY_PATH
-
-# Install packages
-RUN yum update -y && \
- yum groupinstall "Development Tools" -y && \
- yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel boost-devel-1.53.0 && \
- yum clean packages && yum clean all && rm -rf /var/cache/yum && \
-# Install cmake
- cd $MY_ROOT && \
- wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz && \
- tar -zxvf cmake-3.27.3.tar.gz && rm -rf cmake-3.27.3.tar.gz && \
- cd cmake-3.27.3 && \
- ./bootstrap && \
- make && \
- make install && \
- cd $MY_ROOT && \
-# libusb1.0.22
- cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \
- unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \
-# bootstrap steps
- ./bootstrap.sh && \
- ./configure --disable-udev --enable-shared && \
- make -j4 && \
-# configure libusb1.0.22
- cd /opt/libusb-1.0.22/libusb && \
- /bin/mkdir -p '/usr/local/lib' && \
- /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \
- /bin/mkdir -p '/usr/local/include/libusb-1.0' && \
- /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \
- /bin/mkdir -p '/usr/local/lib/pkgconfig' && \
-# Install openvino
- yum-config-manager --add-repo https://yum.repos.intel.com/openvino/2021/setup/intel-openvino-2021.repo && \
- rpm --import https://yum.repos.intel.com/openvino/2021/setup/RPM-GPG-KEY-INTEL-OPENVINO-2021 && \
- yum update -y && yum list intel-openvino* && \
- yum install -y $YUM_OV_PACKAGE && \
- cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && \
- printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2021.4.752/bin/setupvars.sh && \
- cd /opt/libusb-1.0.22 && \
- /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \
- cp /opt/intel/openvino_2021/deployment_tools/inference_engine/external/97-myriad-usbboot.rules /etc/udev/rules.d/ && \
- ldconfig && \
-# Install GPU runtime and drivers
- cd ${MY_ROOT} && \
- mkdir /tmp/opencl && \
- cd /tmp/opencl && \
- yum install -y epel-release && \
- yum install -y ocl-icd ocl-icd-devel && \
- wget -O intel-igc-core-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-core-1.0.2597-1.el7.x86_64.rpm/download && \
- wget -O intel-opencl-19.41.14441-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-opencl-19.41.14441-1.el7.x86_64.rpm/download && \
- wget -O intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm/download && \
- wget -O intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm/download && \
- wget -O intel-gmmlib-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-19.3.2-1.el7.x86_64.rpm/download && \
- wget -O intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm/download && \
- rpm -i /tmp/opencl/*.rpm && \
- ldconfig && \
- rm -rf /tmp/opencl && \
-# Installing gcc-10
- yum install -y centos-release-scl && \
- yum install -y devtoolset-10-gcc* && \
- echo 'source scl_source enable devtoolset-10' >> ~/.bashrc && \
-# python installation
- source scl_source enable devtoolset-10 && \
- cd /code/ && \
- wget https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tgz && tar xvf Python-3.8.3.tgz && \
- cd Python-3.8*/ && ./configure && make && make install && \
- cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.3/ /usr/bin/Python38 && \
-# installing dependancies
- yum install -y python3-lxml python3-six libusb.x86_64 && \
- yum clean packages && yum clean all && rm -rf /var/cache/yum && \
-# Build onnxruntime
- cd $MY_ROOT && \
- pip3 install numpy wheel setuptools cython && \
- git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \
- pip3 install onnx && \
- cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \
- pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \
-# Clean up
- cd $MY_ROOT && rm -rf onnxruntime Python-3* && \
- cd ${MY_ROOT}/ && rm -rf cmake* && \
- cd /usr/share/ && rm -rf gcc* && cd /usr/lib/ && rm -rf gcc cd && rm -rf .cache && \
- cd ${INTEL_OPENVINO_DIR}/ && rm -rf documentation data_processing && cd deployment_tools/ && rm -rf tools
diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp
deleted file mode 100644
index 2529ef4b73209..0000000000000
--- a/dockerfiles/Dockerfile.openvino-csharp
+++ /dev/null
@@ -1,90 +0,0 @@
-#-------------------------------------------------------------------------
-# Copyright(C) 2021-2023 Intel Corporation.
-# SPDX-License-Identifier: MIT
-#--------------------------------------------------------------------------
-
-ARG OPENVINO_VERSION=2023.0.0
-
-# Build stage
-FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} AS base
-
-ENV WORKDIR_PATH=/home/openvino
-WORKDIR $WORKDIR_PATH
-ENV DEBIAN_FRONTEND noninteractive
-
-USER root
-RUN apt update; apt install -y --no-install-recommends wget gnupg && \
- rm -rf /var/lib/apt/lists/*
-
-# Install Mono
-RUN wget http://download.mono-project.com/repo/xamarin.gpg && apt-key add xamarin.gpg && rm xamarin.gpg && \
- echo "deb https://download.mono-project.com/repo/ubuntu stable-bionic main" | tee /etc/apt/sources.list.d/mono-official-stable.list && \
- apt update -y && \
- apt install -y mono-devel
-
-# Install nuget.exe
-RUN wget https://dist.nuget.org/win-x86-commandline/latest/nuget.exe && \
- mv nuget.exe /usr/local/bin/nuget.exe && \
- echo 'mono /usr/local/bin/nuget.exe $@' > /usr/local/bin/nuget && \
- chmod a+x /usr/local/bin/nuget
-
-# Install .NET core
-RUN wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb && \
- dpkg -i packages-microsoft-prod.deb && \
- apt-get update -y &&\
- apt-get install -y apt-transport-https && \
- apt-get update -y && \
- apt-get install -y dotnet-sdk-5.0
-
-# Build stage
-FROM base AS builder
-
-ENV WORKDIR_PATH=/home/openvino
-WORKDIR $WORKDIR_PATH
-ENV DEBIAN_FRONTEND noninteractive
-
-ARG DEVICE=CPU_FP32
-ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
-ARG ONNXRUNTIME_BRANCH=main
-
-ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake
-ENV LANG en_US.UTF-8
-
-USER root
-RUN apt update; apt install -y --no-install-recommends git protobuf-compiler libprotobuf-dev ca-certificates unattended-upgrades && \
- unattended-upgrade && \
- rm -rf /var/lib/apt/lists/*
-
-RUN git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO}
-RUN /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
-RUN ln -s cmake-* cmake-dir
-RUN python3 -m pip install wheel
-ENV PATH=${WORKDIR_PATH}/cmake-dir/bin:$PATH
-RUN pip3 install onnx
-RUN ln -s /usr/bin/python3 /usr/bin/python
-RUN apt install locales && \
- locale-gen en_US en_US.UTF-8 && \
- dpkg-reconfigure locales
-RUN cd onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib
-RUN cp /home/openvino/onnxruntime/build/Linux/Release/Microsoft.ML.OnnxRuntime.Managed* /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts
-
-# Deploy stage
-FROM base
-
-ENV DEBIAN_FRONTEND noninteractive
-USER root
-
-RUN apt update; apt install -y unattended-upgrades fonts-freefont-ttf && \
- unattended-upgrade
-ARG BUILD_UID=1001
-ARG BUILD_USER=onnxruntimedev
-RUN adduser --uid $BUILD_UID $BUILD_USER
-RUN usermod -a -G video,users ${BUILD_USER}
-ENV WORKDIR_PATH /home/${BUILD_USER}
-WORKDIR ${WORKDIR_PATH}
-COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts ${WORKDIR_PATH}/nuget-artifacts
-
-USER ${BUILD_USER}
-ENV PATH=${WORKDIR_PATH}/miniconda/bin:${WORKDIR_PATH}/cmake-dir/bin:$PATH
-ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64
-ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH}
diff --git a/dockerfiles/Dockerfile.openvino-rhel8 b/dockerfiles/Dockerfile.openvino-rhel8
deleted file mode 100644
index 5c504cfa553a1..0000000000000
--- a/dockerfiles/Dockerfile.openvino-rhel8
+++ /dev/null
@@ -1,87 +0,0 @@
-# Build stage
-FROM registry.access.redhat.com/ubi8/ubi:8.4
-
-WORKDIR /code
-
-ARG MY_ROOT=/code
-ARG DEVICE=CPU_FP32
-ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime
-ARG ONNXRUNTIME_BRANCH=main
-
-ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2022.3.0
-
-ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake
-ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64/
-ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake
-ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib/:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH}
-ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/extras/opencv/cmake
-ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/extras/opencv/lib:${LD_LIBRARY_PATH}
-ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:${LD_LIBRARY_PATH}
-ENV PATH=${MY_ROOT}/cmake-dir/bin:$PATH
-
-# Install packages
-RUN yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel git make gcc && \
- yum clean packages && yum clean all && rm -rf /var/cache/yum && \
-# Install python 3.8
- cd $MY_ROOT && \
- wget https://www.python.org/ftp/python/3.8.9/Python-3.8.9.tgz && tar xvf Python-3.8.9.tgz && rm -rf Python-3.8.9.tgz && \
- cd Python-3.8*/ && ./configure && make && make install && \
- cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.9/ /usr/bin/Python38 && ln -s /usr/bin/pip3 /usr/bin/pip && \
-# libusb1.0.22
- cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \
- unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \
-# bootstrap steps
- ./bootstrap.sh && \
- ./configure --disable-udev --enable-shared && \
- make -j4 && \
-# configure libusb1.0.22
- cd /opt/libusb-1.0.22/libusb && \
- /bin/mkdir -p '/usr/local/lib' && \
- /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \
- /bin/mkdir -p '/usr/local/include/libusb-1.0' && \
- /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \
- /bin/mkdir -p '/usr/local/lib/pkgconfig' && \
-# Install openvino
- cd /opt/ && mkdir intel/ && cd intel && \
- wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2022.3/linux/l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \
- tar xvf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \
- rm -rf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \
- mv l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64 openvino_2022.3.0 && \
- cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && ./install_NEO_OCL_driver.sh -y && \
- printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2022.3.0/setupvars.sh && \
- cd /opt/libusb-1.0.22 && \
- /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \
- # MYRIAD plugins are not available for openvino 2022.3.0 release
- #cp /opt/intel/openvino_2022.3.0/install_dependencies/97-myriad-usbboot.rules /etc/udev/rules.d/ && \
- ldconfig && \
-#Install protobuf
- cd $MY_ROOT && \
- git clone https://github.com/protocolbuffers/protobuf.git && \
- cd protobuf && \
- git checkout v3.16.0 && \
- git submodule update --init --recursive && \
- mkdir build_source && cd build_source && \
- cmake ../cmake -DCMAKE_INSTALL_LIBDIR=lib64 -Dprotobuf_BUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_SYSCONFDIR=/etc -DCMAKE_POSITION_INDEPENDENT_CODE=ON -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=Release && \
- make -j$(nproc) && \
- make install && \
-# Build onnxruntime
- cd $MY_ROOT && \
- pip3 install numpy wheel setuptools cython onnx && \
- git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \
- bash onnxruntime/dockerfiles/scripts/install_common_deps.sh && \
- ln -s cmake-* cmake-dir && \
- source /opt/intel/openvino_2022.3.0/setupvars.sh && \
- cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \
- pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \
-# Clean up
- cd ${MY_ROOT} && rm -rf onnxruntime && rm -rf Python-3.8.9 && rm -rf protobuf
-
-# Deploy stage
-ARG BUILD_UID=1001
-ARG BUILD_USER=onnxruntimedev
-RUN adduser --uid $BUILD_UID $BUILD_USER
-RUN usermod -a -G video,users,render ${BUILD_USER}
-ENV WORKDIR_PATH /home/${BUILD_USER}
-
-WORKDIR ${WORKDIR_PATH}
-USER ${BUILD_USER}
diff --git a/dockerfiles/Dockerfile.rocm b/dockerfiles/Dockerfile.rocm
index 35a676383337b..c242933f677f0 100644
--- a/dockerfiles/Dockerfile.rocm
+++ b/dockerfiles/Dockerfile.rocm
@@ -5,14 +5,14 @@
# Dockerfile to run ONNXRuntime with ROCm integration
#--------------------------------------------------------------------------
-FROM rocm/pytorch:rocm5.4_ubuntu20.04_py3.7_pytorch_1.12.1
+FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main
WORKDIR /code
-ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH}
+ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
# Prepare onnxruntime repository & build onnxruntime
RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
diff --git a/dockerfiles/README.md b/dockerfiles/README.md
index f226ebfe8b193..a2e99d66d4654 100644
--- a/dockerfiles/README.md
+++ b/dockerfiles/README.md
@@ -277,7 +277,7 @@ Nothing else from ONNX Runtime source tree will be copied/installed to the image
Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropiate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).
## MIGraphX
-**Ubuntu 20.04, ROCm5.4, AMDMIGraphX v1.2**
+**Ubuntu 20.04, ROCm6.0, MIGraphX**
1. Build the docker image from the Dockerfile in this repository.
```
@@ -291,7 +291,7 @@ Note: When running the container you built in Docker, please either use 'nvidia-
```
## ROCm
-**Ubuntu 20.04, ROCm5.4**
+**Ubuntu 20.04, ROCm6.0**
1. Build the docker image from the Dockerfile in this repository.
```
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index e5b43ddba8cc7..4d7493bd69650 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -41,6 +41,7 @@ Do not modify directly.*
* com.microsoft.Gelu
* com.microsoft.GemmFastGelu
* com.microsoft.GemmFloat8
+ * com.microsoft.GemmaRotaryEmbedding
* com.microsoft.GreedySearch
* com.microsoft.GridSample
* com.microsoft.GroupNorm
@@ -78,6 +79,7 @@ Do not modify directly.*
* com.microsoft.QLinearSigmoid
* com.microsoft.QLinearSoftmax
* com.microsoft.QLinearWhere
+ * com.microsoft.QMoE
* com.microsoft.QOrderedAttention
* com.microsoft.QOrderedGelu
* com.microsoft.QOrderedLayerNormalization
@@ -100,6 +102,7 @@ Do not modify directly.*
* com.microsoft.SkipLayerNormalization
* com.microsoft.SkipSimplifiedLayerNormalization
* com.microsoft.Snpe
+ * com.microsoft.SparseAttention
* com.microsoft.SparseToDenseMatMul
* com.microsoft.Tokenizer
* com.microsoft.TorchEmbedding
@@ -155,6 +158,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Corresponding past and present are same tensor, its size is (2, batch_size, num_heads, max_sequence_length, head_size)
qkv_hidden_sizes : list of ints
Hidden dimension of Q, K, V: hidden_size, hidden_size and v_hidden_size
+rotary_embedding_dim : int
+Dimension of rotary embedding. Limited to 32, 64 or 128. Default value is head_size
scale : float
Custom scale will be used if specified. Default value is 1/sqrt(head_size)
unidirectional : int
@@ -459,7 +464,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : M
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : M
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -1368,7 +1373,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints
-T1 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int32)
+T1 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int32), tensor(int4), tensor(uint4)
Constrain 'x' and 'x_zero_point' to 8-bit integer tensors, 16-bit integer tensors, or 32-bit signed integer tensors.
T2 : tensor(float16), tensor(float)
Constrain 'y', 'x_scale' to float tensors.
@@ -1586,10 +1591,14 @@ This version of the operator has been available since version 1 of the 'com.micr
payload of the execution provider context if embed_mode=1, or path to the context file if embed_mode=0.
ep_sdk_version : string
(Optional) SDK version used to convert the model.
+hardware_architecture : string
+(Optional) Hardware architecture.
main_context : int
Usually each single EPContext associate with a graph partition.But for some case like QNN, it has single EPContext contains all partitions.In that case, the node with ep_cache_context should set main_context=1. Other nodes set main_context=0 and skip ep_cache_context.The path is relative to this Onnx file. Default is 1.
notes : string
(Optional) Some notes for the model
+onnx_model_filename : string
+(Optional) Filename of the original ONNX model.
partition_name : string
(Optional) partitioned graph name.
source : string
@@ -2205,6 +2214,69 @@ This version of the operator has been available since version 1 of the 'com.micr
+### **com.microsoft.GemmaRotaryEmbedding**
+
+ GemmaRotaryEmbedding implements the following part of the rotary positional embedding (RoPE) computation from modeling_gemma.py.
+
+ Here is the onnxscript that was tested:
+
+ from onnxscript import FLOAT, FLOAT16, script
+ from onnxscript import opset18 as op
+
+ @script()
+ def gemma_rotary_embedding(emb: FLOAT["bs", "seq_len", "dim"], q: FLOAT16["bs", "num_heads", "seq_len", "dim"], q_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"], k: FLOAT16["bs", "num_heads", "seq_len", "dim"], k_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"]):
+ sin_val = op.Sin(emb)
+ casted_sin = op.Cast(sin_val, to=10) # for fp16 mixed-precision training. Other types are not supported.
+ cos_val = op.Cos(emb)
+ casted_cos = op.Cast(cos_val, to=10)
+ unsqueezed_sin = op.Unsqueeze(casted_sin, [1])
+ unsqueezed_cos = op.Unsqueeze(casted_cos, [1])
+ q_embed = (q * unsqueezed_cos) + (q_rot * unsqueezed_sin) # unsqueezed caches broadcast over the heads dimension
+ k_embed = (k * unsqueezed_cos) + (k_rot * unsqueezed_sin)
+ return q_embed, k_embed
+
+ onnx_model = gemma_rotary_embedding.to_model_proto()
+
+
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Inputs
+
+
+emb : U
+embedding - 3D tensor with shape (batch_size, seq_len, dim)
+q : T
+q state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+q_rot : T
+half rotated q state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+k : T
+k state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+k_rot : T
+half rotated k state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
+
+#### Outputs
+
+
+output1 : T
+4D tensor with shape (batch_size, num_heads, seq_len, dim)
+output2 : T
+4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
+
+#### Type Constraints
+
+
+T : tensor(float16)
+Constrain input and output types to float16 tensors.
+U : tensor(float)
+Constrain input 0 type to float tensors
+
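+ As an illustration only (the tensor names below are arbitrary, not mandated by the spec), the contrib node itself can be created with the standard onnx helper, following the input order emb, q, q_rot, k, k_rot documented above:
+
+ from onnx import helper
+
+ # GemmaRotaryEmbedding lives in the com.microsoft domain.
+ gemma_rope_node = helper.make_node(
+     "GemmaRotaryEmbedding",
+     inputs=["emb", "q", "q_rot", "k", "k_rot"],
+     outputs=["q_embed", "k_embed"],
+     domain="com.microsoft",
+ )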
+
+
### **com.microsoft.GreedySearch**
Greedy Search for text generation.
@@ -2248,7 +2320,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : I
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : I
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -2385,7 +2457,11 @@ This version of the operator has been available since version 1 of the 'com.micr
Group Query Self/Cross Attention.
- Supports different number of heads for q and kv. Only supports causal or local attention.
+ Using a shared buffer for the k-v cache is highly recommended for both CPU and CUDA; it is enabled through IO binding the past and present kv tensors.
+ Supports a different number of heads for q and kv for CPU and CUDA.
+ Only supports causal and local attention.
+ Supports rotary position embedding for CPU and CUDA.
+ Supports packed QKV input for CPU and CUDA (a sketch of the packed layout follows).
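+
+ To make the packed QKV layout concrete, here is a minimal NumPy sketch (the sizes and the Q/K/V ordering are illustrative assumptions, not part of the spec text above) of how a packed input of width d splits back into query, key and value:
+
+ import numpy as np
+
+ batch_size, sequence_length = 2, 8            # illustrative sizes
+ num_heads, kv_num_heads, head_size = 8, 2, 64
+
+ d = (num_heads + 2 * kv_num_heads) * head_size
+ packed_qkv = np.random.randn(batch_size, sequence_length, d).astype(np.float16)
+
+ q_end = num_heads * head_size
+ k_end = q_end + kv_num_heads * head_size
+ query = packed_qkv[..., :q_end]       # (batch_size, sequence_length, num_heads * head_size)
+ key = packed_qkv[..., q_end:k_end]    # (batch_size, sequence_length, kv_num_heads * head_size)
+ value = packed_qkv[..., k_end:]       # (batch_size, sequence_length, kv_num_heads * head_size)
+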
#### Version
@@ -2394,24 +2470,28 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Attributes
+do_rotary : int
+Whether to use rotary position embedding. Default value is 0.
kv_num_heads : int (required)
Number of attention heads for k and v
local_window_size : int
left_window_size for local attention (like Mistral). Default value is -1 meaning unused.
num_heads : int (required)
Number of attention heads for q
+rotary_interleaved : int
+Rotate using interleaved pattern. Default value is 0 (False).
scale : float
Custom scale will be used if specified. Default value is 1/sqrt(head_size)
-#### Inputs
+#### Inputs (7 - 9)
query : T
-Query with shape (batch_size, sequence_length, hidden_size)
-key : T
+Query with shape (batch_size, sequence_length, hidden_size), or packed QKV with shape (batch_size, sequence_length, d) where d is (num_heads * head_size + 2 * kv_num_heads * head_size).
+key (optional) : T
Key with shape (batch_size, kv_sequence_length, kv_hidden_size)
-value : T
+value (optional) : T
Value with shape (batch_size, kv_sequence_length, kv_hidden_size)
past_key (optional) : T
past state key with support for format BNSH. When past_key uses same tensor as present_key(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length.
@@ -2421,6 +2501,10 @@ This version of the operator has been available since version 1 of the 'com.micr
1d Tensor of shape (batch_size). Indicates past sequence lengths for token generation case.
total_sequence_length : M
Scalar tensor of total sequence length (past + new).
+cos_cache (optional) : T
+2D tensor with shape (max_sequence_length, head_size / 2).
+sin_cache (optional) : T
+2D tensor with shape (max_sequence_length, head_size / 2).
#### Outputs
@@ -2437,7 +2521,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints
-T : tensor(float16)
+T : tensor(float16), tensor(bfloat16), tensor(float)
Constrain input and output to float tensors.
M : tensor(int32)
Constrain mask to int tensor.
@@ -2783,7 +2867,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Constrain input A data type to 8-bit integer tensor.
T2 : tensor(int8), tensor(uint8)
Constrain input B data type to 8-bit integer tensor.
-T3 : tensor(float)
+T3 : tensor(float), tensor(float16)
Constrain input a_scale, b_scale and output Y data type as float tensor.
@@ -2796,22 +2880,23 @@ This version of the operator has been available since version 1 of the 'com.micr
And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,..
3. Input B's scale and zero point are specified by input scales and zero_points.
- Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which:
- - n_blocks_per_col = (K + block_size - 1) / block_size
- - blob_size = block_size / 8 * bits
+ Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which:
+ - n_blocks_per_col = (K + block_size - 1) / block_size
+ - blob_size = CeilDiv(block_size * bits, 8), where 8 is the number of bits in a uint8_t
+ For all bits from 2-8, a row of data is packed contiguously and represented by uint8_t values.
+ - for 2, 4 and 8 bits, 4x2-bit, 2x4-bit or 1x8-bit values are stored in one uint8_t.
+ 4-bit example:
+ |.|.|.|.| .|.|.|.| = uint8_t (2x4-bit)
+ - for 3, 5, 6 and 7 bits, 32x3-bit, 32x5-bit, 16x6-bit and 32x7-bit values are stored in 12, 20, 12 and 28 uint8_t respectively. No bits are wasted.
+ 3-bit example:
+ |.|.|. |.|.|. |.|.|. = 9 bits, which span 2 uint8_t; the highest bit of the second uint8_t is used.
+ The last uint8_t may have some bits unused.
- For a block blob. It is stored in format:
- struct Blob {
- uint8 one_bits[(bits & 0x1) * 1 * block_size / 8]; // highest 1 bit for 3, 5, 7 bits quantization
- uint8 two_bits[(bits & 0x2) * 2 * block_size / 8]; // high 2 bits for 2, 6, 7 bits quantization
- uint8 four_bits[(bits & 0x4) * 4 * block_size / 8]; // low 4 bits for 4, 5, 6 bits quantization
- }
Input scales is stored in same type as original type of B(float32, float16) with shape like: [N * n_blocks_per_col]
- Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored as one unit8_t. If bits > 4, one zero point is stored with one unit8_t. Thus, its shape is:
- - [(N * n_blocks_per_col + 1) / 2] if bits <=4
- - [N * n_blocks_per_col] if bits > 4
-
+ Input zero_points is stored as uint8_t or in the same type as A. It uses the same packing method as input B.
+ - [CeilDiv((N * n_blocks_per_col + 1) * bits, 8)]
+ If zero_points has the same type as A, it is not packed and has the same shape as scales.
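+
+ As a quick sanity check of the arithmetic above, a small Python sketch (the function name and example sizes are illustrative) that computes the packed shapes for given N, K, block_size and bits:
+
+ from math import ceil
+
+ def matmulnbits_packed_shapes(N, K, block_size, bits):
+     n_blocks_per_col = (K + block_size - 1) // block_size
+     blob_size = ceil(block_size * bits / 8)              # bytes per block blob
+     b_shape = (N, n_blocks_per_col, blob_size)           # packed weight B (uint8)
+     scales_shape = (N * n_blocks_per_col,)               # one scale per block
+     zero_points_shape = (ceil((N * n_blocks_per_col + 1) * bits / 8),)  # packed zero points
+     return b_shape, scales_shape, zero_points_shape
+
+ # Example: 4-bit weights, block_size 32.
+ print(matmulnbits_packed_shapes(N=4096, K=4096, block_size=32, bits=4))
+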
#### Version
@@ -2824,23 +2909,29 @@ This version of the operator has been available since version 1 of the 'com.micr
size of each input feature
N : int (required)
size of each output feature
+accuracy_level : int
+The minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) (default unset). It is used to control how input A is quantized or downcast internally while doing computation, for example: 0 means input A will not be quantized or downcast while doing computation. 4 means input A can be quantized with the same block_size to int8 internally from type T1.
bits : int (required)
number of bits used for weight quantization (default 4)
block_size : int (required)
number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.
-#### Inputs (3 - 4)
+#### Inputs (3 - 6)
A : T1
The input tensor, not quantized
B : T2
-1-dimensional data blob
+1 or 2 dimensional data blob
scales : T1
quantization scale
-zero_points (optional) : T2
+zero_points (optional) : T3
quantization zero points
+g_idx (optional) : T4
+group_idx
+bias (optional) : T1
+Bias to add to result. It should have shape [N].
#### Outputs
@@ -2855,8 +2946,12 @@ This version of the operator has been available since version 1 of the 'com.micr
T1 : tensor(float), tensor(float16)
Constrain input and output types to float/half_float tensors.
-T2 : tensor(uint8)
-Constrain quantized weight types to uint8.
+T2 : tensor(uint8), tensor(int32)
+Constrain quantized weight types to uint8/int32.
+T3 : tensor(uint8), tensor(int32), tensor(float16), tensor(float)
+Constrain quantized zero point types to uint8/int32/float16/float.
+T4 : tensor(int32)
+the index tensor.
@@ -2910,8 +3005,8 @@ This version of the operator has been available since version 1 of the 'com.micr
### **com.microsoft.MoE**
Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1,
- GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf)
- usually uses top 32 experts.
+ GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf)
+ usually uses top 32 experts, and Mixtral(https://huggingface.co/blog/mixtral) activates top 2 experts.
#### Version
@@ -2925,9 +3020,11 @@ This version of the operator has been available since version 1 of the 'com.micr
Activation function to use. Choose from relu, gelu, silu and identity. Default is relu
k : int
Number of top experts to select from expert pool
+normalize_routing_weights : int
+Whether to normalize routing weights
-#### Inputs (4 - 6)
+#### Inputs (5 - 8)
input : T
@@ -2936,12 +3033,16 @@ This version of the operator has been available since version 1 of the 'com.micr
2D input tensor with shape (num_rows, num_experts)
fc1_experts_weights : T
3D input tensor with shape (num_experts, hidden_size, inter_size)
-fc2_experts_weights : T
-3D input tensor with shape (num_experts, inter_size, hidden_size)
fc1_experts_bias (optional) : T
2D optional input tensor with shape (num_experts, inter_size)
+fc2_experts_weights : T
+3D input tensor with shape (num_experts, inter_size, hidden_size)
fc2_experts_bias (optional) : T
2D optional input tensor with shape (num_experts, hidden_size)
+fc3_experts_weights (optional) : T
+3D optional input tensor with shape (num_experts, hidden_size, inter_size)
+fc3_experts_bias (optional) : T
+2D optional input tensor with shape (num_experts, inter_size)
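+
+ To relate these shapes to the computation, a minimal NumPy sketch of top-k routing using only the fc1/fc2 path (relu activation, no fc3, row-by-row loop, simplified handling of router_probs; an illustration of the data flow, not the kernel's exact algorithm):
+
+ import numpy as np
+
+ def moe_forward(x, router_probs, fc1_w, fc1_b, fc2_w, fc2_b, k, normalize_routing_weights=1):
+     # x: (num_rows, hidden_size), router_probs: (num_rows, num_experts)
+     # fc1_w: (num_experts, hidden_size, inter_size), fc2_w: (num_experts, inter_size, hidden_size)
+     out = np.zeros_like(x)
+     for row in range(x.shape[0]):
+         top = np.argsort(router_probs[row])[::-1][:k]   # indices of the k highest-scoring experts
+         weights = router_probs[row, top]
+         if normalize_routing_weights:
+             weights = weights / weights.sum()
+         for w, e in zip(weights, top):
+             h = np.maximum(x[row] @ fc1_w[e] + fc1_b[e], 0.0)   # fc1 + relu
+             out[row] += w * (h @ fc2_w[e] + fc2_b[e])           # fc2, weighted by routing weight
+     return out
+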
#### Outputs
@@ -3025,6 +3126,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Number of attention heads
scale : float
Custom scale will be used if specified. Default value is 1/sqrt(head_size)
+unidirectional : int
+Whether every token can only attend to previous tokens. Default value is 0.
#### Inputs (1 - 8)
@@ -3321,7 +3424,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Input tensors contains the hidden embedding of real tokens.
Token_offset records the offset of token in the unpacked input.
- cumulated_token_count records cumulated length of each sequnces length.
+ cumulated_token_count records the cumulative length of each sequence.
The operator only supports BERT like model with padding on right now.
@@ -3395,7 +3498,7 @@ This version of the operator has been available since version 1 of the 'com.micr
The query, key and value tensors contain result of hidden embedding of real tokens after input projections.
Token_offset records the offset of token in the unpacked input.
- cumulative_sequence_length records cumulated length of each sequnces length.
+ cumulative_sequence_length records the cumulative length of each sequence.
The operator only supports BERT like model with padding on right now.
@@ -4232,6 +4335,69 @@ This version of the operator has been available since version 1 of the 'com.micr
+### **com.microsoft.QMoE**
+
+ Int4 MoE
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+
+activation_type : string
+Activation function to use. Choose from relu, gelu, silu and identity. Default is relu
+k : int
+Number of top experts to select from expert pool
+normalize_routing_weights : int
+Whether to normalize routing weights
+
+
+#### Inputs (7 - 11)
+
+
+input : T
+2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)
+router_probs : T
+2D input tensor with shape (num_rows, num_experts)
+fc1_experts_weights : T1
+3D input tensor with shape (num_experts, hidden_size, inter_size / 2)
+fc1_scales : T
+2D input tensor with shape (num_experts, inter_size)
+fc1_experts_bias (optional) : T
+2D optional input tensor with shape (num_experts, inter_size)
+fc2_experts_weights : T1
+3D input tensor with shape (num_experts, inter_size, hidden_size / 2)
+fc2_scales : T
+2D input tensor with shape (num_experts, hidden_size)
+fc2_experts_bias (optional) : T
+2D optional input tensor with shape (num_experts, hidden_size)
+fc3_experts_weights (optional) : T1
+3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)
+fc3_scales (optional) : T
+2D optional input tensor with shape (num_experts, inter_size)
+fc3_experts_bias (optional) : T
+2D optional input tensor with shape (num_experts, inter_size)
+
+
+#### Outputs
+
+
+output : T
+2D output tensor with shape (num_rows, hidden_size) or 3D output tensor with shape (batch_size, sequence_length, hidden_size)
+
+
+#### Type Constraints
+
+
+T : tensor(float16)
+Constrain input and output types to float16 tensors.
+T1 : tensor(uint8)
+Constrain weights type to uint8 tensors.
+
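+ The "/ 2" in the expert weight shapes comes from packing two int4 values into each uint8. A hedged sketch of one possible packing (the actual nibble order used by the kernel is not specified here):
+
+ import numpy as np
+
+ def pack_int4(w):
+     # w: integer tensor with values in [0, 15]; the last dimension must be even.
+     w = w.astype(np.uint8)
+     low, high = w[..., 0::2], w[..., 1::2]
+     return (low & 0x0F) | (high << 4)                    # (..., last_dim / 2)
+
+ # Illustrative sizes: (num_experts, hidden_size, inter_size)
+ fc1_int4 = np.random.randint(0, 16, size=(4, 64, 128))
+ fc1_experts_weights = pack_int4(fc1_int4)                # (4, 64, 64)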
+
+
### **com.microsoft.QOrderedAttention**
Quantized version of simplified Multi-Head Self Attention(using int8 with specific matrix Layout).
@@ -4666,7 +4832,7 @@ This version of the operator has been available since version 1 of the 'com.micr
T1 : tensor(float16), tensor(float)
Constrain 'x', 'y_scale' to float tensors.
-T2 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16)
+T2 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int4), tensor(uint4)
Constrain 'y_zero_point' and 'y' to 8-bit and 16-bit integer tensors.
@@ -5015,6 +5181,10 @@ This version of the operator has been available since version 1 of the 'com.micr
interleaved : int
Rotate using interleaved pattern. Default value is 0 (False).
+num_heads : int
+Number of attention heads. Default value is 0. Must be used together with rotary_embedding_dim.
+rotary_embedding_dim : int
+Rotary embedding dimension. Default value is 0.
scale : float
Custom scale will be used if specified. Default value is 1.0
@@ -5027,9 +5197,9 @@ This version of the operator has been available since version 1 of the 'com.micr
position_ids : M
1D tensor with shape (1) or 2D tensor with shape (batch_size, sequence_length)
cos_cache : T
-2D tensor with shape (max_sequence_length, head_size / 2).
+2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)
sin_cache : T
-2D tensor with shape (max_sequence_length, head_size / 2).
+2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)
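+
+ For reference, a common way to build cos_cache/sin_cache of shape (max_sequence_length, rotary_embedding_dim / 2) is sketched below, assuming the usual RoPE formulation with base 10000 (the base and exact construction are model choices, not fixed by this operator):
+
+ import numpy as np
+
+ def build_rope_cache(max_sequence_length, rotary_embedding_dim, base=10000.0):
+     inv_freq = 1.0 / (base ** (np.arange(0, rotary_embedding_dim, 2) / rotary_embedding_dim))
+     positions = np.arange(max_sequence_length)
+     angles = np.outer(positions, inv_freq)   # (max_sequence_length, rotary_embedding_dim / 2)
+     return np.cos(angles), np.sin(angles)
+
+ cos_cache, sin_cache = build_rope_cache(4096, 64)
+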
#### Outputs
@@ -5042,7 +5212,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints
-T : tensor(float), tensor(float16)
+T : tensor(float), tensor(float16), tensor(bfloat16)
Constrain input and output types to float tensors.
M : tensor(int64)
Constrain input and output types to integer tensors
@@ -5134,7 +5304,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : I
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : I
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5377,6 +5547,110 @@ This version of the operator has been available since version 1 of the 'com.micr
+### **com.microsoft.SparseAttention**
+
+ Block Sparse Attention used in Phi-3-small (https://arxiv.org/pdf/2404.14219).
+
+ It is inspired by Sparse Transformers (https://arxiv.org/pdf/1904.10509) and BigBird (https://arxiv.org/pdf/2007.14062).
+
+ block_mask can be used to configure a sparse layout for different heads.
+ When the number of sparse layouts is 1, all heads share the same sparse layout. Otherwise, the layouts are assigned to heads cyclically.
+ For example, given 4 layouts (S0, S1, S2, S3), 8 heads will have layouts like (S0, S1, S2, S3, S0, S1, S2, S3).
+
+ The block_row_indices and block_col_indices are the CSR representation of the block mask. The block_col_indices might contain
+ paddings at the right side when different layouts have different numbers of non-zeros in the block mask.
+
+ An example of block mask with 2 layouts where each layout is 4 x 4 blocks:
+ [[[1, 0, 0, 0],
+ [1, 1, 0, 0],
+ [0, 1, 1, 0],
+ [0, 1, 1, 1]],
+
+ [[1, 0, 0, 0],
+ [1, 1, 0, 0],
+ [1, 1, 1, 0],
+ [1, 0, 1, 1]]]
+
+ The corresponding CSR format:
+ block_col_indices = [[0, 0, 1, 1, 2, 1, 2, 3, -1], [0, 0, 1, 0, 1, 2, 0, 2, 3]]
+ block_row_indices = [[0, 1, 3, 5, 8], [0, 1, 3, 6, 9]]
+
+ When do_rotary is True, cos_cache and sin_cache are required. Note that the maximum sequence length supported by cos
+ or sin cache can be different from the maximum sequence length used by kv cache.
+
+ Only supports unidirectional attention with cache of past key and value in linear buffers.
+
+ For performance, past_key and present_key share the same memory buffer, as do past_value and present_value.
+
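+ The padded CSR form shown in the example above can be reproduced with a small helper like the following (an illustrative sketch, not part of the operator):
+
+ import numpy as np
+
+ def block_mask_to_csr(block_mask):
+     # block_mask: (num_layout, max_blocks, max_blocks) array of 0/1
+     row_indices, col_indices = [], []
+     for layout in block_mask:
+         rows, cols = [0], []
+         for r in layout:
+             cols.extend(np.nonzero(r)[0].tolist())
+             rows.append(len(cols))
+         row_indices.append(rows)
+         col_indices.append(cols)
+     max_nnz = max(len(c) for c in col_indices)
+     col_indices = [c + [-1] * (max_nnz - len(c)) for c in col_indices]   # right-pad with -1
+     return np.array(row_indices, dtype=np.int32), np.array(col_indices, dtype=np.int32)
+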
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+
+do_rotary : int
+Whether to use rotary position embedding. Default value is 0.
+kv_num_heads : int (required)
+Number of attention heads for key and value
+num_heads : int (required)
+Number of attention heads for query
+rotary_interleaved : int
+Whether rotary uses the interleaved pattern. Default value is 0.
+scale : float
+Scaling factor applied prior to softmax. The default value is 1/sqrt(head_size)
+sparse_block_size : int (required)
+Number of tokens per sparse block. Choices: 16, 32, 64, 128
+
+
+#### Inputs (9 - 11)
+
+
+query : T
+Query with shape (batch_size, sequence_length, num_heads * head_size), or packed QKV with shape (batch_size, sequence_length, d) where d is (num_heads + 2 * kv_num_heads) * head_size.
+key (optional) : T
+Key with shape (batch_size, sequence_length, kv_num_heads * head_size)
+value (optional) : T
+Value with shape (batch_size, sequence_length, kv_num_heads * head_size)
+past_key : T
+Key cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size)
+past_value : T
+Value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size)
+block_row_indices : M
+The row indices of CSR format of block mask with shape (num_layout, max_blocks + 1). The num_heads is divisible by num_layout, and max_blocks is max_sequence_length / sparse_block_size.
+block_col_indices : M
+The col indices of CSR format of block mask with shape (num_layout, max_nnz_blocks). The max_nnz_blocks is the maximum number of non-zeros per layout in the block mask.
+total_sequence_length : M
+Scalar tensor of maximum total sequence length (past_sequence_length + sequence_length) among keys.
+key_total_sequence_lengths : M
+1D tensor with shape (batch_size) where each value is total sequence length of key excluding paddings.
+cos_cache (optional) : T
+Cos cache of rotary with shape (max_rotary_sequence_length, head_size / 2).
+sin_cache (optional) : T
+Sin cache of rotary with shape (max_rotary_sequence_length, head_size / 2).
+
+
+#### Outputs
+
+
+output : T
+3D output tensor with shape (batch_size, sequence_length, num_heads * head_size)
+present_key : T
+Updated key cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).
+present_value : T
+Updated value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).
+
+
+#### Type Constraints
+
+
+T : tensor(float16), tensor(bfloat16)
+Constrain input and output to float tensors.
+M : tensor(int32)
+Constrain integer type.
+
+
+
### **com.microsoft.SparseToDenseMatMul**
#### Version
@@ -5723,12 +5997,14 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Attributes
+beginning_timestamp_token_id : int
+The id of the first timestamp
decoder : graph (required)
Decoder subgraph to execute in a loop.
decoder_output_cross_qk : int
If nozero, decoder subgraph contains output Q*K from cross attentions. Default 0.
decoder_start_token_id : int
-The id of the token that indicates decoding starts.
+The id of the token that indicates decoding starts (i.e. the start of transcription token id)
early_stopping : int
early stop or not
encoder : graph
@@ -5741,15 +6017,23 @@ This version of the operator has been available since version 1 of the 'com.micr
Must be 2 for whisper
no_repeat_ngram_size : int
no repeat ngrams size
-no_speech_token : int
+no_speech_token_id : int
The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. Default -1.
+no_timestamps_token_id : int
+The id of the token that indicates no timestamps
pad_token_id : int (required)
The id of the padding token
+start_of_lm_token_id : int
+The id of the token that indicates LM starts
+transcribe_token_id : int
+The id of the transcribe task
+translate_token_id : int
+The id of the translate task
vocab_size : int
Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape
-#### Inputs (5 - 14)
+#### Inputs (5 - 15)
input_ids : F
@@ -5763,11 +6047,11 @@ This version of the operator has been available since version 1 of the 'com.micr
num_return_sequences : I
The number of returned sequences in the batch. Shape is (1)
length_penalty (optional) : T
-Exponential penalty to the length. Default value 1.0 means no penalty.Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences.Shape is (1,)
+Exponential penalty to the length. Default value 1.0 means no penalty. Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences. Shape is (1,)
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : M
-Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+Mask of vocabulary. Words masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : M
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5777,9 +6061,11 @@ This version of the operator has been available since version 1 of the 'com.micr
logits_processor (optional) : I
Specific logits processor for different types of beamsearch models. Default value 0 means no specific logit processor. Accepts value >= 0. Shape is (1)
cross_qk_layer_head (optional) : I
-Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect allits shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]
+Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. By default all are collected. Its shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]
extra_decoding_ids (optional) : I
Part of the decoder_input_ids that we need cross qk for it. it is of shape (batch_size, extra_decoding_ids_len).In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) are treated as stop of the extra_decoding_ids for corresponding batch.
+temperature (optional) : T
+Temperature value to apply to logits processing during this execution's decoding. Shape is (1)
#### Outputs (1 - 5)
@@ -5790,11 +6076,11 @@ This version of the operator has been available since version 1 of the 'com.micr
sequences_scores (optional) : T
Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)
scores (optional) : T
-Processed beam scores for each vocabulary token at each generation step.Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam.Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)
+Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam. Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)
cross_qk (optional) : V
-Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers,B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F].If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]
+Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers, B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]. If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]
non_speech_probs (optional) : T
-For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token.Currently we treat the last token's logits is what we need, in future extra graph logic may be add to the encoder/context-decoder subgraph.The prob is save before logits may be updated by extra-decoding-ids. The shape of non_speech_probs is [B]
+For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token_id. The shape of non_speech_probs is [B]
#### Type Constraints
diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md
index fd787b017617e..264c620a8e693 100644
--- a/docs/How_To_Update_ONNX_Dev_Notes.md
+++ b/docs/How_To_Update_ONNX_Dev_Notes.md
@@ -17,9 +17,12 @@ git add onnx
1. Update [cgmanifests/generated/cgmanifest.json](/cgmanifests/generated/cgmanifest.json).
This file should be generated. See [cgmanifests/README](/cgmanifests/README.md) for instructions.
-1. Update [tools/ci_build/github/linux/docker/scripts/requirements.txt](/tools/ci_build/github/linux/docker/scripts/requirements.txt)
- and [tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt](/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt).
- Update the commit hash for `git+http://github.com/onnx/onnx.git@targetonnxcommithash#egg=onnx`.
+1. Update Python requirements files with the updated ONNX version (e.g., `onnx==1.16.0`) or commit hash if building from source (e.g., `git+http://github.com/onnx/onnx.git@targetonnxcommithash#egg=onnx`).
+- [onnxruntime/test/python/requirements.txt](/onnxruntime/test/python/requirements.txt)
+- [tools/ci_build/github/linux/docker/scripts/requirements.txt](/tools/ci_build/github/linux/docker/scripts/requirements.txt)
+- [tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt](/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt)
+- [tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt](/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt)
+- Run `git grep -rn "onnx==1" .` to find other locations and update this document if necessary.
1. If there is any change to `cmake/external/onnx/onnx/*.in.proto`, you need to regenerate OnnxMl.cs.
[Building onnxruntime with Nuget](https://onnxruntime.ai/docs/build/inferencing.html#build-nuget-packages) will do
diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md
index 97f7e7ff2c14b..f8e015c3db9e4 100644
--- a/docs/Memory_Optimizer.md
+++ b/docs/Memory_Optimizer.md
@@ -30,16 +30,23 @@ Integrate models using `ORTModule`.
```
There are two modes to enable the memory optimizations:
-- Aggressively Recompute All Within Each Transformer Layer, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=1`. This will recompute all detected subgraphs within each Transformer Attention+MLP layer. It is easy to enable, but be noted this recompute plan may NOT be the best one. In this mode, `ORTMODULE_MEMORY_OPT_CONFIG` env values passed by users are not respected.
-- User Specified Subgraph Recompute, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=0` and `export ORTMODULE_MEMORY_OPT_CONFIG=,,...`. This is an advanced usage, that allows users to find the most suitable graphs to recompute, at the cost of overhead to look for the best plans.
+- Transformer layerwise recompute, i.e. aggressively recompute all supported nodes within each transformer layer (usually including the attention and MLP sublayers), enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=1`. In this mode, `ORTMODULE_MEMORY_OPT_CONFIG` env values passed by users are not respected.
+- Manually selected subgraph recompute, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=0` and `export ORTMODULE_MEMORY_OPT_CONFIG=<json_file_path>`. This is an advanced usage that allows users to find the most suitable graphs to recompute, at the cost of the overhead of searching for the best plans. The format for its content is:
+ ```
+ [
+ "",
+ "",
+ ...
+ ]
+ ```
-### Mode 1 - Simple Usage (Aggressively Recompute All Within Each Transformer Layer)
+### Mode 1 - Simple Usage (Transformer Layerwise Recompute)
1. Set memory optimization level to be TRANSFORMER_LAYERWISE_RECOMPUTE, by `export ORTMODULE_MEMORY_OPT_LEVEL=1`
2. Run the training as usual; check the logs, you could find something like this if the current log level <= LogLevel.INFO:
```
- Memory Optimizer : ON : Memory Optimization Level: [TRANSFORMER_LAYERWISE_RECOMPUTE], Optimization Config: [Reshape+Where+:1:-1,BiasSoftmax+:1:-1,Cast+:1:-1,BiasGelu+:1:-1,FusedMatMul+:1:-1,Add+:1:-1,Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+:1:-1]
+ Memory Optimizer : ON : Memory Optimization Level: [TRANSFORMER_LAYERWISE_RECOMPUTE], Optimization Config: mem_opt.json
Configs Freq Max Saving(Bytes) Saving Symbolic(Bytes)
- Plan 1 : ON : Reshape+Where+:1:-1 1 134,217,728 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
- Plan 2 : ON : BiasSoftmax+:1:-1 1 134,086,656 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1*(inputs_input_ids_dim1 - 1)
@@ -51,6 +58,7 @@ There are two modes to enable the memory optimizations:
- Plan 8 : OFF : Cast+:2:-1 1 2,048 2.0*inputs_input_ids_dim0*inputs_input_ids_dim1
```
3. As shown above, `Config` is a string representative for a re-computable subgraph. All are enabled for recompute in this case.
+4. By `export ORTMODULE_MEMORY_OPT_LEVEL=2`, all plans including compromised recomputable subgraphs will also be enabled.
### Mode 2 - Advanced Usage (User Selected Subgraph Recompute)
@@ -58,7 +66,7 @@ There are two modes to enable the memory optimizations:
1. Be noted `ORTMODULE_MEMORY_OPT_LEVEL` is by default be 0. Run the training as usual; then stop it after training a few steps.
2. Check the logs, you could find something like this if the current log level <= LogLevel.INFO::
```
- Memory Optimizer : OFF : Enable with env ORTMODULE_MEMORY_OPT_LEVEL=1 or ORTMODULE_MEMORY_OPT_CONFIG=,,...
+ Memory Optimizer : OFF : Enable with env ORTMODULE_MEMORY_OPT_LEVEL=1 or ORTMODULE_MEMORY_OPT_CONFIG=
Configs Freq Max Saving(Bytes) Saving Symbolic(Bytes)
- Plan 1 : OFF : Reshape+Where+:1:-1 1 134,217,728 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
- Plan 2 : OFF : BiasSoftmax+:1:-1 1 134,086,656 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1*(inputs_input_ids_dim1 - 1)
@@ -72,8 +80,15 @@ There are two modes to enable the memory optimizations:
3. As shown above, `Config` is a string representative for a re-computable subgraph. All are disabled for recompute in this case.
4. Set environment variable `ORTMODULE_MEMORY_OPT_CONFIG` to enable some of the subgraphs to do recompute.
```bash
- # Use comma as a separator for enabling more than one subgraphs.
- export ORTMODULE_MEMORY_OPT_CONFIG="BiasGelu+:1:1"
+ export ORTMODULE_MEMORY_OPT_CONFIG="mem_opt.json"
+
+ # Content of mem_opt.json:
+ [
+ "BiasGelu+:1:1",
+ "Dropout+:1:-1"
+ ]
+ # Use a comma as a separator to enable more than one subgraph in the json file.
+
# Explanation:
# > BiasGelu+ is the subgraph string representative;
# > 1 in the middle indicates 'Recompute' is enabled (0, on the contrary indicates it's disabled)
@@ -82,7 +97,7 @@ There are two modes to enable the memory optimizations:
```
5. Then run the training again, and you will see logs like this:
```
- Memory Optimizer : ON : Memory Optimization Level: [USER_SPECIFIED], Optimization Config: [BiasGelu+:1:-1]
+ Memory Optimizer : ON : Memory Optimization Level: [USER_SPECIFIED], Optimization Config: mem_opt.json
Configs Freq Max Saving(Bytes) Saving Symbolic(Bytes)
- Plan 1 : OFF : Reshape+Where+:1:-1 1 134,217,728 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
- Plan 2 : OFF : BiasSoftmax+:1:-1 1 134,086,656 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1*(inputs_input_ids_dim1 - 1)
@@ -126,7 +141,7 @@ MemoryInsight Summary - User config: not provided
|6 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph FusedMatMul+Add+Reshape+ |
-| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=FusedMatMul+Add+Reshape+:1:-1 |
+| | Status : Disabled. |
| | Stashed Activations: |
| | - ReuseFreq : Output 0(6), |
| | - Output 0 : [((inputs_input_ids_dim0)*(inputs_input_ids_dim1)*(32)*(240))], byte/elem: 2, 100% saved |
@@ -134,26 +149,26 @@ MemoryInsight Summary - User config: not provided
|5 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph FusedMatMul+ |
-| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=FusedMatMul+:1:-1 |
+| | Status : Disabled. |
| | Stashed Activations: |
| | - Output 0 : [((inputs_input_ids_dim0)*(inputs_input_ids_dim1)*(10240))], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _|
|5 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Cast+ |
-| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Cast+:1:-1 |
+| | Status : Disabled. |
| | Stashed Activations: |
| | - Output 0 : [((inputs_input_ids_dim0)*(32)*(inputs_input_ids_dim1)*(inputs_input_ids_dim1))], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _|
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+ |
-| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+:1:-1 |
+| | Status : Disabled. |
| | Stashed Activations: |
| | - Output 0 : [((inputs_input_ids_dim0)*(1)*(1)*(inputs_input_ids_dim1))], byte/elem: 4, 100% saved |
| | |
| |>>Option 2 : RecomputeWithCompromise subgraph Cast+ |
-| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Cast+:2:-1 |
+| | Status : Disabled. |
| | Stashed Activations: |
| | - Output 0 : [((inputs_input_ids_dim0)*(1)*(1)*(inputs_input_ids_dim1))], byte/elem: 4, 50% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _|
diff --git a/docs/Model_Test.md b/docs/Model_Test.md
index 960900d1c2b83..48ee3c3835899 100644
--- a/docs/Model_Test.md
+++ b/docs/Model_Test.md
@@ -1,27 +1,20 @@
ONNX has a collection of standard tests. This document describes how to run these tests through a C++ program named 'onnx_test_runner' in this repo. You could also run these test through onnxruntime python binding, which would be much easier to setup, but, a bit harder to debug issues.
# Get the test data
-You should have:
-1. onnx single node test data
-2. onnx model zoo models
-
-## Install onnx python package
-You can get onnx python package from [pypi](https://pypi.org/). However, if you are a onnxruntime developer, you may need to work on a cutting edge ONNX version. In this case, you need to build and install ONNX from source code.
-
-### Install ONNX from source code
-1. (windows) set ONNX_ML=1
- (linux) export ONNX_ML=1
-2. Install protobuf and put protoc into your PATH environment. When you compile protobuf, it's better to only enable the static libraries.
-3. run "python setup.py bdist_wheel" and "pip install dist/*.whl"
-
-## Generate node test data
-$ python3 -m onnx.backend.test.cmd_tools generate-data -o
-e.g.
- python3 -m onnx.backend.test.cmd_tools generate-data -o C:\testdata
-
-
-## Get more models
-Download https://onnxruntimetestdata.blob.core.windows.net/models/20190419.zip and unzip it.
+```
+git submodule update --init --recursive
+pushd .
+cd cmake/external/emsdk
+./emsdk install latest
+./emsdk activate latest
+source ./emsdk_env.sh
+popd
+cd js
+npm install
+npm run prepare-node-tests
+```
+
+In addition to that, you can get more test models with their test data from https://github.com/onnx/models .
# Compile onnx_test_runner and run the tests
diff --git a/docs/ORTModule_Convergence_Notes.md b/docs/ORTModule_Convergence_Notes.md
index 791b6c32c9b48..2374e7b7c538a 100644
--- a/docs/ORTModule_Convergence_Notes.md
+++ b/docs/ORTModule_Convergence_Notes.md
@@ -89,7 +89,7 @@ The limitation of `GlobalSubscriberManager` is, only 'nn.Module's forward output
dump the intermediate tensors in a `nn.Module`'s forward function, refer to the following example:
```diff
-+ from onnxruntime.training.utils import inspect_activation
++ from onnxruntime.training.utils.hooks import inspect_activation
class BloomForCausalLM(BloomPreTrainedModel):
def __init__(self, config: BloomConfig):
...
diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md
index bede16204d420..8d5472ba30601 100644
--- a/docs/ORTModule_Training_Guidelines.md
+++ b/docs/ORTModule_Training_Guidelines.md
@@ -208,19 +208,6 @@ debugging).
export ORTMODULE_ENABLE_COMPUTE_OPTIMIZER=0 # Disable
```
-#### ORTMODULE_ENABLE_SPARSE_OPTIMIZER
-
-- **Feature Area**: *ORTMODULE/Optimizations*
-- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the input data sparsity
-based performance optimizations, including embedding sparsity and label sparsity.
-This optimization is applicable when using optimum, which has an implementation of the ModuleWithLoss class that wraps the HuggingFace Training that allows loss computation inside ONNX Runtime (ORT).
-If you're not using optimum but want to implement a similar wrapper in your codebase to compute the loss inside ONNX Runtime (ORT), you can refer to this [Link](ORTModule_ModuleWithLoss_Wrapper.md) for detailed steps and guidelines on how to achieve this.
-
- ```bash
- export ORTMODULE_ENABLE_SPARSE_OPTIMIZER=1 # Enable
- export ORTMODULE_ENABLE_SPARSE_OPTIMIZER=0 # Disable
- ```
-
#### ORTMODULE_PRINT_INPUT_DENSITY
- **Feature Area**: *ORTMODULE/RuntimeInspector*
@@ -246,7 +233,7 @@ to standard outputs.
#### ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER
- **Feature Area**: *ORTMODULE/Optimizations*
-- **Description**: By default, this is disabled. This env var can be used for enabling or disabling the embedding input
+- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the embedding input
data sparsity based performance optimizations.
```bash
@@ -254,6 +241,17 @@ data sparsity based performance optimizations.
export ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER=0 # Disable
```
+#### ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER
+
+- **Feature Area**: *ORTMODULE/Optimizations*
+- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the label input
+data sparsity based performance optimizations.
+
+ ```bash
+ export ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER=1 # Enable
+ export ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER=0 # Disable
+ ```
+
#### ORTMODULE_CACHE_DIR
- **Feature Area**: *ORTMODULE/RuntimeOptions*
@@ -287,12 +285,25 @@ A classical usage of disabling the deep copy: when the deep copy before module e
#### ORTMODULE_MEMORY_OPT_LEVEL
- **Feature Area**: *ORTMODULE/Optimizations*
-- **Description**: By default, the level is 0. This env var can be used for enabling recomputation for reducing memory peak requirement. Setting the level to be 0 means all detected subgraphs with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint. When level is not 0, check Check [Memory Optimizer for ONNX Runtime Training](Memory_Optimizer.md) for more details.
+- **Description**: By default, the level is 0. This env var can be used for enabling recomputation for reducing memory peak requirement.
+ - Setting the level to be 1 means all detected recomputable subgraphs (NOT including compromised recomputable graphs) with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint.
+ - Setting the level to be 2 means all detected recomputable subgraphs (including compromised recomputable graphs) with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint.
+ - When the level is 0, check [Memory Optimizer for ONNX Runtime Training](Memory_Optimizer.md) for more details.
```bash
export ORTMODULE_MEMORY_OPT_LEVEL=0
```
+#### ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT
+
+- **Feature Area**: *ORTMODULE/Optimizations*
+- **Description**: By default, memory-efficient gradient management is turned off. When enabled, each gradient, once computed in ONNX Runtime, triggers the corresponding parameter's backward function through the `PythonOpGrad` operator. This helps release the gradient buffer managed in ONNX Runtime earlier; otherwise the buffer is released only once all backward computation finishes.
+
+ ```bash
+ export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=1 # Enable
+ export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=0 # Disable
+ ```
+
### 2.2 Memory Optimization
Q: *Want to run a bigger batch size?*
@@ -482,3 +493,31 @@ for epoch in range(start_epoch, n_epochs):
```
Check [LoadBalancingDistributedBatchSampler implementation](../orttraining/orttraining/python/training/utils/data/sampler.py) for more details.
+
+## 8 Using ORTPipelineModule for Deepspeed Pipeline Parallel
+
+You can use `ORTPipelineModule` to support Deepspeed Pipeline Parallelism. Here's how you can integrate it into your pipeline:
+
+```python
+from onnxruntime.training.ortmodule import DebugOptions, LogLevel
+from onnxruntime.training.ortmodule.experimental.pipe import ORTPipelineModule
+
+# Create a debug configuration if needed
+# Since we're exporting multiple graphs here, this will generate multiple graphs with their index added as a prefix to differentiate them.
+
+debug_options = DebugOptions(save_onnx=True, log_level=LogLevel.VERBOSE, onnx_prefix="model_name")
+
+# Keep your deepspeed script the same and use ORTPipelineModule instead of PipelineModule
+# Initialize the ORTPipelineModule
+pipeline_module = ORTPipelineModule(
+ layers,
+ num_stages=2, # Set your number of stages
+ base_seed=1234,
+ partition_method="parameters",
+ debug_options=debug_options # Pass the debug configuration if needed
+)
+
+# Keep the rest of the script as it is.
+```
+
+Check [ORTPipelineModule implementation](../orttraining/orttraining/python/training/ortmodule/experimental/pipe/_ort_pipeline_module.py) for more details.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index edf249a816923..8092c26da651a 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -51,7 +51,8 @@ Do not modify directly.*
|BitwiseOr|*in* A:**T** *in* B:**T** *out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|BitwiseXor|*in* A:**T** *in* B:**T** *out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|BlackmanWindow|*in* size:**T1** *out* output:**T2**|17+|**T1** = tensor(int32), tensor(int64) **T2** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Cast|*in* input:**T1** *out* output:**T2**|19+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Cast|*in* input:**T1** *out* output:**T2**|21+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[19, 20]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[13, 18]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[6, 12]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Ceil|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
@@ -68,8 +69,9 @@ Do not modify directly.*
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[4, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|ConcatFromSequence|*in* input_sequence:**S** *out* concat_result:**T**|11+|**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
-|ConstantOfShape|*in* input:**T1** *out* output:**T2**|20+|**T1** = tensor(int64) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|||[9, 19]|**T1** = tensor(int64) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|ConstantOfShape|*in* input:**T1** *out* output:**T2**|21+|**T1** = tensor(int64) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||20|**T1** = tensor(int64) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[9, 19]|**T1** = tensor(int64) **T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Conv|*in* X:**T** *in* W:**T** *in* B:**T** *out* Y:**T**|11+|**T** = tensor(float)|
|||[1, 10]|**T** = tensor(float)|
|ConvInteger|*in* x:**T1** *in* w:**T2** *in* x_zero_point:**T1** *in* w_zero_point:**T2** *out* y:**T3**|10+|**T1** = tensor(uint8) **T2** = tensor(uint8) **T3** = tensor(int32)|
@@ -80,11 +82,13 @@ Do not modify directly.*
|Crop|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float)|
|CumSum|*in* x:**T** *in* axis:**T2** *out* y:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64) **T2** = tensor(int32), tensor(int64)|
|||[11, 13]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64) **T2** = tensor(int32), tensor(int64)|
-|DFT|*in* input:**T1** *in* dft_length:**T2** *in* axis:**tensor(int64)** *out* output:**T1** or *in* input:**T1** *in* dft_length:**T2** *out* output:**T1**|17+|**T1** = tensor(double), tensor(float) **T2** = tensor(int32), tensor(int64)|
+|DFT|*in* input:**T1** *in* dft_length:**T2** *in* axis:**tensor(int64)** *out* output:**T1** or *in* input:**T1** *in* dft_length:**T2** *out* output:**T1**|20+|**T1** = tensor(double), tensor(float) **T2** = tensor(int32), tensor(int64)|
+|||[17, 19]|**T1** = tensor(double), tensor(float) **T2** = tensor(int32), tensor(int64)|
|DepthToSpace|*in* input:**T** *out* output:**T**|13+|**T** = tensor(double), tensor(float)|
|||[11, 12]|**T** = tensor(double), tensor(float)|
|||[1, 10]|**T** = tensor(double), tensor(float)|
-|DequantizeLinear|*in* x:**T** *in* x_scale:**tensor(float)** *in* x_zero_point:**T** *out* y:**tensor(float)** or *in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|19+|**T1** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int32), tensor(int8), tensor(uint8) **T2** = tensor(float), tensor(float16)|
+|DequantizeLinear|*in* x:**T** *in* x_scale:**tensor(float)** *in* x_zero_point:**T** *out* y:**tensor(float)** or *in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int4), tensor(int8), tensor(uint16), tensor(uint4), tensor(uint8) **T2** = tensor(float), tensor(float16)|
+|||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int32), tensor(int8), tensor(uint8) **T2** = tensor(float), tensor(float16)|
|||[13, 18]|**T** = tensor(int32), tensor(int8), tensor(uint8)|
|||[10, 12]|**T** = tensor(int32), tensor(int8), tensor(uint8)|
|Det|*in* X:**T** *out* Y:**T**|11+|**T** = tensor(float)|
@@ -110,7 +114,8 @@ Do not modify directly.*
|Expand|*in* input:**T** *in* shape:**tensor(int64)** *out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[8, 12]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|EyeLike|*in* input:**T1** *out* output:**T2**|9+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint64) **T2** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint64)|
-|Flatten|*in* input:**T** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Flatten|*in* input:**T** *out* output:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[9, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 8]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -126,6 +131,7 @@ Do not modify directly.*
|GatherND|*in* data:**T** *in* indices:**tensor(int64)** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **indices** = tensor(int64)|
|||12|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **indices** = tensor(int64)|
|||11|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **indices** = tensor(int64)|
+|Gelu|*in* X:**T** *out* Y:**T**|20+|**T** = tensor(float)|
|Gemm|*in* A:**T** *in* B:**T** *in* C:**T** *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[11, 12]|**T** = tensor(double), tensor(float)|
|||[9, 10]|**T** = tensor(double), tensor(float)|
@@ -146,21 +152,23 @@ Do not modify directly.*
|Hardmax|*in* input:**T** *out* output:**T**|13+|**T** = tensor(float)|
|||[11, 12]|**T** = tensor(float)|
|||[1, 10]|**T** = tensor(float)|
-|Identity|*in* input:**T** *out* output:**T** or *in* input:**V** *out* output:**V**|19+|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Identity|*in* input:**T** *out* output:**T** or *in* input:**V** *out* output:**V**|21+|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[19, 20]|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[16, 18]|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[14, 15]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|If|*in* cond:**B** *out* outputs:**V**|19+|**B** = tensor(bool) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|If|*in* cond:**B** *out* outputs:**V**|21+|**B** = tensor(bool) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[19, 20]|**B** = tensor(bool) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[16, 18]|**B** = tensor(bool) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[13, 15]|**B** = tensor(bool) **V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**B** = tensor(bool) **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**B** = tensor(bool) **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|ImageScaler|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float)|
|InstanceNormalization|*in* input:**T** *in* scale:**T** *in* B:**T** *out* output:**T**|6+|**T** = tensor(float)|
-|IsInf|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz) **T2** = tensor(bool)|
+|IsInf|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz) **T2** = tensor(bool)|
|||[10, 19]|**T1** = tensor(double), tensor(float) **T2** = tensor(bool)|
-|IsNaN|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz) **T2** = tensor(bool)|
+|IsNaN|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz) **T2** = tensor(bool)|
|||[13, 19]|**T1** = tensor(double), tensor(float), tensor(float16) **T2** = tensor(bool)|
|||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16) **T2** = tensor(bool)|
|LRN|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(float)|
@@ -181,7 +189,8 @@ Do not modify directly.*
|LogSoftmax|*in* input:**T** *out* output:**T**|13+|**T** = tensor(double), tensor(float)|
|||[11, 12]|**T** = tensor(double), tensor(float)|
|||[1, 10]|**T** = tensor(double), tensor(float)|
-|Loop|*in* M:**I** *in* cond:**B** *in* v_initial:**V** *out* v_final_and_scan_outputs:**V**|19+|**B** = tensor(bool) **I** = tensor(int64) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Loop|*in* M:**I** *in* cond:**B** *in* v_initial:**V** *out* v_final_and_scan_outputs:**V**|21+|**B** = tensor(bool) **I** = tensor(int64) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[19, 20]|**B** = tensor(bool) **I** = tensor(int64) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[16, 18]|**B** = tensor(bool) **I** = tensor(int64) **V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[13, 15]|**B** = tensor(bool) **I** = tensor(int64) **V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**B** = tensor(bool) **I** = tensor(int64) **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -237,7 +246,8 @@ Do not modify directly.*
|PRelu|*in* X:**T** *in* slope:**T** *out* Y:**T**|16+|**T** = tensor(float)|
|||[9, 15]|**T** = tensor(float)|
|||[7, 8]|**T** = tensor(float)|
-|Pad|*in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *in* axes:**Tind** *out* output:**T** or *in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *out* output:**T** or *in* data:**T** *out* output:**T**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Pad|*in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *in* axes:**Tind** *out* output:**T** or *in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *out* output:**T** or *in* data:**T** *out* output:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[19, 20]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||18|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -248,8 +258,9 @@ Do not modify directly.*
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64) **T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||[7, 11]|**T** = tensor(double), tensor(float)|
|QLinearConv|*in* x:**T1** *in* x_scale:**tensor(float)** *in* x_zero_point:**T1** *in* w:**T2** *in* w_scale:**tensor(float)** *in* w_zero_point:**T2** *in* y_scale:**tensor(float)** *in* y_zero_point:**T3** *in* B:**T4** *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(int8), tensor(uint8) **T4** = tensor(int32)|
-|QLinearMatMul|*in* a:**T1** *in* a_scale:**tensor(float)** *in* a_zero_point:**T1** *in* b:**T2** *in* b_scale:**tensor(float)** *in* b_zero_point:**T2** *in* y_scale:**tensor(float)** *in* y_zero_point:**T3** *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(int8), tensor(uint8)|
-|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2** or *in* x:**T1** *in* y_scale:**tensor(float)** *in* y_zero_point:**T2** *out* y:**T2**|19+|**T1** = tensor(float), tensor(float16) **T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int8), tensor(uint8)|
+|QLinearMatMul|*in* a:**T1** *in* a_scale:**TS** *in* a_zero_point:**T1** *in* b:**T2** *in* b_scale:**TS** *in* b_zero_point:**T2** *in* y_scale:**TS** *in* y_zero_point:**T3** *out* y:**T3** or *in* a:**T1** *in* a_scale:**tensor(float)** *in* a_zero_point:**T1** *in* b:**T2** *in* b_scale:**tensor(float)** *in* b_zero_point:**T2** *in* y_scale:**tensor(float)** *in* y_zero_point:**T3** *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(int8), tensor(uint8)|
+|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2** or *in* x:**T1** *in* y_scale:**tensor(float)** *in* y_zero_point:**T2** *out* y:**T2**|21+|**T1** = tensor(float), tensor(float16) **T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int4), tensor(int8), tensor(uint16), tensor(uint4), tensor(uint8)|
+|||[19, 20]|**T1** = tensor(float), tensor(float16) **T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int8), tensor(uint8)|
|||[13, 18]|**T1** = tensor(float) **T2** = tensor(int8), tensor(uint8)|
|||[10, 12]|**T1** = tensor(float) **T2** = tensor(int8), tensor(uint8)|
|RNN|*in* X:**T** *in* W:**T** *in* R:**T** *in* B:**T** *in* sequence_lens:**T1** *in* initial_h:**T** *out* Y:**T** *out* Y_h:**T**|14+|**T** = tensor(float) **T1** = tensor(int32)|
@@ -277,7 +288,8 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
-|ReduceMax|*in* data:**T** *in* axes:**tensor(int64)** *out* reduced:**T** or *in* data:**T** *out* reduced:**T**|18+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
+|ReduceMax|*in* data:**T** *in* axes:**tensor(int64)** *out* reduced:**T** or *in* data:**T** *out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
+|||[18, 19]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
|||11|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
@@ -286,7 +298,8 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32)|
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(int32)|
-|ReduceMin|*in* data:**T** *in* axes:**tensor(int64)** *out* reduced:**T** or *in* data:**T** *out* reduced:**T**|18+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
+|ReduceMin|*in* data:**T** *in* axes:**tensor(int64)** *out* reduced:**T** or *in* data:**T** *out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
+|||[18, 19]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)|
|||11|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
@@ -302,10 +315,12 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
+|RegexFullMatch|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(string) **T2** = tensor(bool)|
|Relu|*in* X:**T** *out* Y:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int8)|
|||13|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
-|Reshape|*in* data:**T** *in* shape:**tensor(int64)** *out* reshaped:**T** or *in* data:**T** *out* reshaped:**T**|19+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
+|Reshape|*in* data:**T** *in* shape:**tensor(int64)** *out* reshaped:**T** or *in* data:**T** *out* reshaped:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
+|||[19, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
|||[14, 18]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
|||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
|||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
@@ -322,7 +337,8 @@ Do not modify directly.*
|STFT|*in* signal:**T1** *in* frame_step:**T2** *in* window:**T1** *in* frame_length:**T2** *out* output:**T1**|17+|**T1** = tensor(double), tensor(float) **T2** = tensor(int32), tensor(int64)|
|Scale|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float)|
|ScaledTanh|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float)|
-|Scan|*in* initial_state_and_scan_inputs:**V** *out* final_state_and_scan_outputs:**V** or *in* sequence_lens:**I** *in* initial_state_and_scan_inputs:**V** *out* final_state_and_scan_outputs:**V**|19+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Scan|*in* initial_state_and_scan_inputs:**V** *out* final_state_and_scan_outputs:**V** or *in* sequence_lens:**I** *in* initial_state_and_scan_inputs:**V** *out* final_state_and_scan_outputs:**V**|21+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[19, 20]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[16, 18]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 15]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[9, 10]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -343,7 +359,8 @@ Do not modify directly.*
|SequenceErase|*in* input_sequence:**S** *in* position:**I** *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
|SequenceInsert|*in* input_sequence:**S** *in* tensor:**T** *in* position:**I** *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
|SequenceLength|*in* input_sequence:**S** *out* length:**I**|11+|**I** = tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
-|Shape|*in* data:**T** *out* shape:**T1**|19+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|Shape|*in* data:**T** *out* shape:**T1**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|||[19, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||[15, 18]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||[13, 14]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
@@ -355,7 +372,8 @@ Do not modify directly.*
|SimplifiedLayerNormalization|*in* X:**T** *in* scale:**V** *out* Y:**V** *out* inv_std_var:**U**|1+|**T** = tensor(double), tensor(float) **U** = tensor(double), tensor(float) **V** = tensor(double), tensor(float)|
|Sin|*in* input:**T** *out* output:**T**|7+|**T** = tensor(double), tensor(float)|
|Sinh|*in* input:**T** *out* output:**T**|9+|**T** = tensor(float)|
-|Size|*in* data:**T** *out* size:**T1**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|Size|*in* data:**T** *out* size:**T1**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|||[19, 20]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||[13, 18]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||[1, 12]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|Slice|*in* data:**T** *in* starts:**Tind** *in* ends:**Tind** *in* axes:**Tind** *in* steps:**Tind** *out* output:**T** or *in* data:**T** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
@@ -376,10 +394,13 @@ Do not modify directly.*
|SplitToSequence|*in* input:**T** *in* split:**I** *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)) **T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(string)|
|Sqrt|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
-|Squeeze|*in* data:**T** *in* axes:**tensor(int64)** *out* squeezed:**T** or *in* data:**T** *out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Squeeze|*in* data:**T** *in* axes:**tensor(int64)** *out* squeezed:**T** or *in* data:**T** *out* squeezed:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|StringConcat|*in* X:**T** *in* Y:**T** *out* Z:**T**|20+|**T** = tensor(string)|
|StringNormalizer|*in* X:**tensor(string)** *out* Y:**tensor(string)**|10+|**X** = tensor(string)|
+|StringSplit|*in* X:**T1** *out* Y:**T2** *out* Z:**T3**|20+|**T1** = tensor(string) **T2** = tensor(string) **T3** = tensor(int64)|
|Sub|*in* A:**T** *in* B:**T** *out* C:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||13|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||[7, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
@@ -397,11 +418,13 @@ Do not modify directly.*
|TopK|*in* X:**T** *in* K:**tensor(int64)** *out* Values:**T** *out* Indices:**I** or *in* X:**T** *out* Values:**T** *out* Indices:**I**|11+|**I** = tensor(int64) **T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||10|**I** = tensor(int64) **T** = tensor(double), tensor(float)|
|||[1, 9]|**I** = tensor(int64) **T** = tensor(double), tensor(float)|
-|Transpose|*in* data:**T** *out* transposed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Transpose|*in* data:**T** *out* transposed:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Trilu|*in* input:**T** *in* k:**tensor(int64)** *out* output:**T**|14+|**T** = tensor(double), tensor(float), tensor(int64)|
|Unique|*in* X:**T** *out* Y:**T** *out* indices:**tensor(int64)** *out* inverse_indices:**tensor(int64)** *out* counts:**tensor(int64)**|11+|**T** = tensor(double), tensor(float), tensor(int64), tensor(int8), tensor(string)|
-|Unsqueeze|*in* data:**T** *in* axes:**tensor(int64)** *out* expanded:**T** or *in* data:**T** *out* expanded:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Unsqueeze|*in* data:**T** *in* axes:**tensor(int64)** *out* expanded:**T** or *in* data:**T** *out* expanded:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Upsample|*in* X:**T** *in* scales:**tensor(float)** *out* Y:**T** or *in* X:**T** *out* Y:**T**|9|**T** = tensor(float), tensor(int32), tensor(int8), tensor(uint8)|
@@ -419,7 +442,8 @@ Do not modify directly.*
|DictVectorizer|*in* X:**T1** *out* Y:**T2**|1+|**T1** = map(int64,tensor(double)), map(int64,tensor(float)), map(int64,tensor(string)), map(string,tensor(double)), map(string,tensor(float)), map(string,tensor(int64)) **T2** = tensor(double), tensor(float), tensor(int64), tensor(string)|
|FeatureVectorizer|*in* X:**T1** *out* Y:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|Imputer|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(int64)|
-|LabelEncoder|*in* X:**T1** *out* Y:**T2**|2+|**T1** = tensor(float), tensor(int64), tensor(string) **T2** = tensor(float), tensor(int64), tensor(string)|
+|LabelEncoder|*in* X:**T1** *out* Y:**T2**|4+|**T1** = tensor(double), tensor(float), tensor(int64), tensor(string) **T2** = tensor(double), tensor(float), tensor(int16), tensor(int64), tensor(string)|
+|||[2, 3]|**T1** = tensor(float), tensor(int64), tensor(string) **T2** = tensor(float), tensor(int64), tensor(string)|
|||1|**T1** = tensor(int64), tensor(string) **T2** = tensor(int64), tensor(string)|
|LinearClassifier|*in* X:**T1** *out* Y:**T2** *out* Z:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64) **T2** = tensor(int64), tensor(string)|
|LinearRegressor|*in* X:**T** *out* Y:**tensor(float)**|1+|**T** = tensor(float)|
@@ -444,7 +468,7 @@ Do not modify directly.*
|CDist|*in* A:**T** *in* B:**T** *out* C:**T**|1+|**T** = tensor(double), tensor(float)|
|ConvTransposeWithDynamicPads|*in* X:**T** *in* W:**T** *in* Pads:**tensor(int64)** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float)|
|CropAndResize|*in* X:**T1** *in* rois:**T1** *in* batch_indices:**T2** *in* crop_size:**T2** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(int32)|
-|DequantizeLinear|*in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|1+|**T1** = tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint8) **T2** = tensor(float)|
+|DequantizeLinear|*in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|1+|**T1** = tensor(int16), tensor(int32), tensor(int4), tensor(int8), tensor(uint16), tensor(uint4), tensor(uint8) **T2** = tensor(float)|
|DynamicQuantizeLSTM|*in* X:**T** *in* W:**T2** *in* R:**T2** *in* B:**T** *in* sequence_lens:**T1** *in* initial_h:**T** *in* initial_c:**T** *in* P:**T** *in* W_scale:**T** *in* W_zero_point:**T2** *in* R_scale:**T** *in* R_zero_point:**T2** *out* Y:**T** *out* Y_h:**T** *out* Y_c:**T**|1+|**T** = tensor(float) **T1** = tensor(int32) **T2** = tensor(int8), tensor(uint8)|
|DynamicQuantizeMatMul|*in* A:**T1** *in* B:**T2** *in* b_scale:**T1** *in* b_zero_point:**T2** *in* bias:**T1** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(int8), tensor(uint8)|
|EmbedLayerNormalization|*in* input_ids:**T1** *in* segment_ids:**T1** *in* word_embedding:**T** *in* position_embedding:**T** *in* segment_embedding:**T** *in* gamma:**T** *in* beta:**T** *in* mask:**T1** *in* position_ids:**T1** *out* output:**T** *out* mask_index:**T1** *out* embedding_sum:**T**|1+|**T** = tensor(float)|
@@ -457,12 +481,13 @@ Do not modify directly.*
|Gelu|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(float)|
|GreedySearch|*in* input_ids:**I** *in* max_length:**I** *in* min_length:**I** *in* repetition_penalty:**T** *in* vocab_mask:**I** *in* prefix_vocab_mask:**I** *in* attention_mask:**I** *out* sequences:**I**|1+|**T** = tensor(float)|
|GridSample|*in* X:**T1** *in* Grid:**T1** *out* Y:**T2**|1+|**T1** = tensor(float) **T2** = tensor(float)|
+|GroupQueryAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* past_key:**T** *in* past_value:**T** *in* seqlens_k:**M** *in* total_sequence_length:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**M** = tensor(int32) **T** = tensor(float)|
|Inverse|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|MatMulBnb4|*in* A:**T1** *in* B:**T2** *in* absmax:**T1** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(uint8)|
|MatMulFpQ4|*in* A:**T1** *in* B:**T2** *in* B_shape:**T3** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(uint8) **T3** = tensor(int64)|
|MatMulInteger16|*in* A:**T1** *in* B:**T2** *out* Y:**T3**|1+|**T1** = tensor(int16) **T2** = tensor(int16) **T3** = tensor(int32)|
|MatMulIntegerToFloat|*in* A:**T1** *in* B:**T2** *in* a_scale:**T3** *in* b_scale:**T3** *in* a_zero_point:**T1** *in* b_zero_point:**T2** *in* bias:**T3** *out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(float)|
-|MatMulNBits|*in* A:**T1** *in* B:**T2** *in* scales:**T1** *in* zero_points:**T2** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(uint8)|
+|MatMulNBits|*in* A:**T1** *in* B:**T2** *in* scales:**T1** *in* zero_points:**T3** *in* g_idx:**T4** *in* bias:**T1** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(uint8) **T3** = tensor(float), tensor(uint8) **T4** = tensor(int32)|
|MaxpoolWithMask|*in* X:**T** *in* M:**tensor(int32)** *out* Y:**T**|1+|**T** = tensor(float)|
|MultiHeadAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* bias:**T** *in* key_padding_mask:**M** *in* relative_position_bias:**T** *in* past_key:**T** *in* past_value:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**T** = tensor(float)|
|MurmurHash3|*in* X:**T1** *out* Y:**T2**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(string), tensor(uint32), tensor(uint64) **T2** = tensor(int32), tensor(uint32)|
@@ -479,7 +504,7 @@ Do not modify directly.*
|QLinearSigmoid|*in* X:**T** *in* X_scale:**tensor(float)** *in* X_zero_point:**T** *in* Y_scale:**tensor(float)** *in* Y_zero_point:**T** *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearSoftmax|*in* X:**T** *in* X_scale:**tensor(float)** *in* x_zero_point:**T** *in* y_scale:**tensor(float)** *in* y_zero_point:**T** *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearWhere|*in* condition:**B** *in* X:**T** *in* x_scale:**TF** *in* x_zero_point:**T** *in* Y:**T** *in* y_scale:**TF** *in* y_zero_point:**T** *in* z_scale:**TF** *in* z_zero_point:**T** *out* Z:**T**|1+|**T** = tensor(int8), tensor(uint8)|
-|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2**|1+|**T1** = tensor(float) **T2** = tensor(int16), tensor(int8), tensor(uint16), tensor(uint8)|
+|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2**|1+|**T1** = tensor(float) **T2** = tensor(int16), tensor(int4), tensor(int8), tensor(uint16), tensor(uint4), tensor(uint8)|
|QuickGelu|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(float)|
|Range|*in* start:**T** *in* limit:**T** *in* delta:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)|
|RotaryEmbedding|*in* input:**T** *in* position_ids:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T**|1+|**M** = tensor(int64) **T** = tensor(float)|
@@ -492,7 +517,7 @@ Do not modify directly.*
|TransposeMatMul|*in* A:**T** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float)|
|Trilu|*in* X:**T** *in* k:**tensor(int64)** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int64)|
|Unique|*in* x:**T** *out* y:**T** *out* idx:**tensor(int64)** *out* counts:**tensor(int64)**|1+|**T** = tensor(float)|
-|WhisperBeamSearch|*in* input_ids:**F** *in* max_length:**I** *in* min_length:**I** *in* num_beams:**I** *in* num_return_sequences:**I** *in* length_penalty:**T** *in* repetition_penalty:**T** *in* vocab_mask:**M** *in* prefix_vocab_mask:**M** *in* attention_mask:**I** *in* decoder_input_ids:**I** *in* logits_processor:**I** *in* cross_qk_layer_head:**I** *in* extra_decoding_ids:**I** *out* sequences:**I** *out* sequences_scores:**T** *out* scores:**T** *out* cross_qk:**V** *out* non_speech_probs:**T**|1+|**T** = tensor(float)|
+|WhisperBeamSearch|*in* input_ids:**F** *in* max_length:**I** *in* min_length:**I** *in* num_beams:**I** *in* num_return_sequences:**I** *in* length_penalty:**T** *in* repetition_penalty:**T** *in* vocab_mask:**M** *in* prefix_vocab_mask:**M** *in* attention_mask:**I** *in* decoder_input_ids:**I** *in* logits_processor:**I** *in* cross_qk_layer_head:**I** *in* extra_decoding_ids:**I** *in* temperature:**T** *out* sequences:**I** *out* sequences_scores:**T** *out* scores:**T** *out* cross_qk:**V** *out* non_speech_probs:**T**|1+|**T** = tensor(float)|
|WordConvEmbedding|*in* Sequence:**T** *in* W:**T1** *in* B:**T1** *in* C:**T1** *out* Y:**T1**|1+|**T** = tensor(int32) **T1** = tensor(float)|
| |
| |
@@ -576,9 +601,9 @@ Do not modify directly.*
|Equal|*in* A:**T** *in* B:**T** *out* C:**T1**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64) **T1** = tensor(bool)|
|||[11, 12]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
|||[7, 10]|**T** = tensor(bool), tensor(int32), tensor(int64)|
-|Erf|*in* input:**T** *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|Erf|*in* input:**T** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[9, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Exp|*in* input:**T** *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|Exp|*in* input:**T** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|Expand|*in* input:**T** *in* shape:**tensor(int64)** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[8, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -599,6 +624,7 @@ Do not modify directly.*
|GatherND|*in* data:**T** *in* indices:**tensor(int64)** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64) **indices** = tensor(int64)|
|||12|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64) **indices** = tensor(int64)|
|||11|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64) **indices** = tensor(int64)|
+|Gelu|*in* X:**T** *out* Y:**T**|20+|**T** = tensor(double), tensor(float), tensor(float16)|
|Gemm|*in* A:**T** *in* B:**T** *in* C:**T** *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
@@ -610,6 +636,7 @@ Do not modify directly.*
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|GreaterOrEqual|*in* A:**T** *in* B:**T** *out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64) **T1** = tensor(bool)|
|||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64) **T1** = tensor(bool)|
+|GridSample|*in* X:**T1** *in* grid:**T2** *out* Y:**T1**|16+|**T1** = tensor(float) **T2** = tensor(float)|
|HardSigmoid|*in* X:**T** *out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
|Identity|*in* input:**T** *out* output:**T** or *in* input:**V** *out* output:**V**|19+|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[14, 18]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -621,6 +648,11 @@ Do not modify directly.*
|||[1, 10]|**B** = tensor(bool) **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|ImageScaler|*in* input:**T** *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|InstanceNormalization|*in* input:**T** *in* scale:**T** *in* B:**T** *out* output:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
+|IsInf|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz) **T2** = tensor(bool)|
+|||[10, 19]|**T1** = tensor(double), tensor(float) **T2** = tensor(bool)|
+|IsNaN|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz) **T2** = tensor(bool)|
+|||[13, 19]|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16) **T2** = tensor(bool)|
+|||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16) **T2** = tensor(bool)|
|LRN|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|LSTM|*in* X:**T** *in* W:**T** *in* R:**T** *in* B:**T** *in* sequence_lens:**T1** *in* initial_h:**T** *in* initial_c:**T** *in* P:**T** *out* Y:**T** *out* Y_h:**T** *out* Y_c:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16) **T1** = tensor(int32)|
@@ -675,7 +707,8 @@ Do not modify directly.*
|PRelu|*in* X:**T** *in* slope:**T** *out* Y:**T**|16+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 15]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Pad|*in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *in* axes:**Tind** *out* output:**T** or *in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *out* output:**T** or *in* data:**T** *out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
+|Pad|*in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *in* axes:**Tind** *out* output:**T** or *in* data:**T** *in* pads:**tensor(int64)** *in* constant_value:**T** *out* output:**T** or *in* data:**T** *out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
+|||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[2, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
|ParametricSoftplus|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
@@ -723,7 +756,8 @@ Do not modify directly.*
|||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
|||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **shape** = tensor(int64)|
|||[1, 4]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Resize|*in* X:**T** *in* scales:**tensor(float)** *out* Y:**T** or *in* X:**T1** *in* roi:**T2** *in* scales:**tensor(float)** *in* sizes:**tensor(int64)** *out* Y:**T1**|13+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
+|Resize|*in* X:**T** *in* scales:**tensor(float)** *out* Y:**T** or *in* X:**T1** *in* roi:**T2** *in* scales:**tensor(float)** *in* sizes:**tensor(int64)** *out* Y:**T1**|18+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
+|||[13, 17]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
|||[11, 12]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
|||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
|ReverseSequence|*in* input:**T** *in* sequence_lens:**tensor(int64)** *out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -736,9 +770,13 @@ Do not modify directly.*
|||[9, 10]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||8|**I** = tensor(int64) **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Scatter|*in* data:**T** *in* indices:**Tind** *in* updates:**T** *out* output:**T**|[9, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
-|ScatterElements|*in* data:**T** *in* indices:**Tind** *in* updates:**T** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
+|ScatterElements|*in* data:**T** *in* indices:**Tind** *in* updates:**T** *out* output:**T**|18+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
+|||[16, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
+|||[13, 15]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
-|ScatterND|*in* data:**T** *in* indices:**tensor(int64)** *in* updates:**T** *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|ScatterND|*in* data:**T** *in* indices:**tensor(int64)** *in* updates:**T** *out* output:**T**|18+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[16, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 15]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Selu|*in* X:**T** *out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
|SequenceAt|*in* input_sequence:**S** *in* position:**I** *out* tensor:**T**|11+|**I** = tensor(int32), tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)) **T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -755,7 +793,7 @@ Do not modify directly.*
|Sigmoid|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|Sign|*in* input:**T** *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|SimplifiedLayerNormalization|*in* X:**T** *in* scale:**V** *out* Y:**V** *out* inv_std_var:**U**|1+|**T** = tensor(double), tensor(float), tensor(float16) **U** = tensor(double), tensor(float) **V** = tensor(double), tensor(float), tensor(float16)|
+|SimplifiedLayerNormalization|*in* X:**T** *in* scale:**V** *out* Y:**V** *out* inv_std_var:**U**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16) **U** = tensor(double), tensor(float) **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|Sin|*in* input:**T** *out* output:**T**|7+|**T** = tensor(double), tensor(float), tensor(float16)|
|Size|*in* data:**T** *out* size:**T1**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
@@ -774,7 +812,7 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[2, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Sqrt|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|Sqrt|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|Squeeze|*in* data:**T** *in* axes:**tensor(int64)** *out* squeezed:**T** or *in* data:**T** *out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -833,22 +871,24 @@ Do not modify directly.*
|GatedRelativePositionBias|*in* query_layer:**T** *in* query_bias:**T** *in* rel_pos:**T** *in* weight:**T** *in* bias:**T** *in* eco_a:**T** *in* token_offset:**M** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|Gelu|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|GemmFloat8|*in* A:**TA** *in* B:**TB** *in* C:**TC** *in* scaleA:**TS** *in* scaleB:**TS** *in* scaleY:**TS** *out* Y:**TR**|1+|**TA** = tensor(bfloat16), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e5m2) **TB** = tensor(bfloat16), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e5m2) **TR** = tensor(bfloat16), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e5m2) **TS** = tensor(float)|
+|GemmaRotaryEmbedding|*in* emb:**U** *in* q:**T** *in* q_rot:**T** *in* k:**T** *in* k_rot:**T** *out* output1:**T** *out* output2:**T**|1+|**T** = tensor(float16) **U** = tensor(float)|
|GreedySearch|*in* input_ids:**I** *in* max_length:**I** *in* min_length:**I** *in* repetition_penalty:**T** *in* vocab_mask:**I** *in* prefix_vocab_mask:**I** *in* attention_mask:**I** *out* sequences:**I**|1+|**T** = tensor(float), tensor(float16)|
|GridSample|*in* X:**T1** *in* Grid:**T1** *out* Y:**T2**|1+|**T1** = tensor(float) **T2** = tensor(float)|
|GroupNorm|*in* X:**T** *in* gamma:**M** *in* beta:**M** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
-|GroupQueryAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* past_key:**T** *in* past_value:**T** *in* seqlens_k:**M** *in* total_sequence_length:**M** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**M** = tensor(int32) **T** = tensor(float16)|
+|GroupQueryAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* past_key:**T** *in* past_value:**T** *in* seqlens_k:**M** *in* total_sequence_length:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**M** = tensor(int32) **T** = tensor(bfloat16), tensor(float16)|
|Inverse|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|Irfft|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|LongformerAttention|*in* input:**T** *in* weight:**T** *in* bias:**T** *in* mask:**T** *in* global_weight:**T** *in* global_bias:**T** *in* global:**G** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|MatMulBnb4|*in* A:**T1** *in* B:**T2** *in* absmax:**T1** *out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16) **T2** = tensor(uint8)|
-|MatMulNBits|*in* A:**T1** *in* B:**T2** *in* scales:**T1** *in* zero_points:**T2** *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16) **T2** = tensor(uint8)|
-|MoE|*in* input:**T** *in* router_probs:**T** *in* fc1_experts_weights:**T** *in* fc2_experts_weights:**T** *in* fc1_experts_bias:**T** *in* fc2_experts_bias:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
+|MatMulNBits|*in* A:**T1** *in* B:**T2** *in* scales:**T1** *in* zero_points:**T3** *in* g_idx:**T4** *in* bias:**T1** *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16) **T2** = tensor(uint8)|
+|MoE|*in* input:**T** *in* router_probs:**T** *in* fc1_experts_weights:**T** *in* fc1_experts_bias:**T** *in* fc2_experts_weights:**T** *in* fc2_experts_bias:**T** *in* fc3_experts_weights:**T** *in* fc3_experts_bias:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|MultiHeadAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* bias:**T** *in* key_padding_mask:**M** *in* relative_position_bias:**T** *in* past_key:**T** *in* past_value:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)|
|NGramRepeatBlock|*in* input_ids:**Tid** *in* scores:**T** *out* scores_out:**T**|1+|**T** = tensor(float) **Tid** = tensor(int64)|
|NhwcConv|*in* X:**T** *in* W:**T** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|PackedAttention|*in* input:**T** *in* weights:**T** *in* bias:**T** *in* token_offset:**M** *in* cumulative_sequence_length:**M** *in* relative_position_bias:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|PackedMultiHeadAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* bias:**T** *in* token_offset:**M** *in* cumulative_sequence_length:**M** *in* relative_position_bias:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|QAttention|*in* input:**T1** *in* weight:**T2** *in* bias:**T3** *in* input_scale:**T3** *in* weight_scale:**T3** *in* mask_index:**T4** *in* input_zero_point:**T1** *in* weight_zero_point:**T2** *in* past:**T3** *out* output:**T3** *out* present:**T3**|1+|**T1** = tensor(int8) **T2** = tensor(int8) **T3** = tensor(float), tensor(float16) **T4** = tensor(int32)|
+|QMoE|*in* input:**T** *in* router_probs:**T** *in* fc1_experts_weights:**T1** *in* fc1_scales:**T** *in* fc1_experts_bias:**T** *in* fc2_experts_weights:**T1** *in* fc2_scales:**T** *in* fc2_experts_bias:**T** *in* fc3_experts_weights:**T1** *in* fc3_scales:**T** *in* fc3_experts_bias:**T** *out* output:**T**|1+|**T** = tensor(float16) **T1** = tensor(uint8)|
|QOrderedAttention|*in* input:**Q** *in* scale_input:**S** *in* scale_Q_gemm:**S** *in* scale_K_gemm:**S** *in* scale_V_gemm:**S** *in* Q_weight:**Q** *in* K_weight:**Q** *in* V_weight:**Q** *in* scale_Q_weight:**S** *in* scale_K_weight:**S** *in* scale_V_weight:**S** *in* Q_bias:**S** *in* K_bias:**S** *in* V_bias:**S** *in* scale_QKT_gemm:**S** *in* scale_QKT_softmax:**S** *in* scale_values_gemm:**S** *in* mask_index:**G** *in* past:**Q** *in* relative_position_bias:**S** *out* output:**Q**|1+|**G** = tensor(int32) **Q** = tensor(int8) **S** = tensor(float)|
|QOrderedGelu|*in* X:**Q** *in* scale_X:**S** *in* scale_Y:**S** *out* Y:**Q**|1+|**Q** = tensor(int8) **S** = tensor(float)|
|QOrderedLayerNormalization|*in* X:**Q** *in* scale_X:**S** *in* scale:**F** *in* B:**F** *in* scale_Y:**S** *out* Y:**Q**|1+|**F** = tensor(float), tensor(float16) **Q** = tensor(int8) **S** = tensor(float)|
@@ -861,15 +901,16 @@ Do not modify directly.*
|RemovePadding|*in* input:**T** *in* sequence_token_count:**M** *out* output:**T** *out* token_offset:**M** *out* cumulated_seq_len:**M** *out* max_seq_len:**M**|1+|**T** = tensor(float), tensor(float16)|
|RestorePadding|*in* input:**T** *in* token_offset:**M** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|Rfft|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
-|RotaryEmbedding|*in* input:**T** *in* position_ids:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T**|1+|**M** = tensor(int64) **T** = tensor(float), tensor(float16)|
+|RotaryEmbedding|*in* input:**T** *in* position_ids:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T**|1+|**M** = tensor(int64) **T** = tensor(bfloat16), tensor(float), tensor(float16)|
|Sampling|*in* input_ids:**I** *in* max_length:**I** *in* min_length:**I** *in* repetition_penalty:**T** *in* vocab_mask:**I** *in* prefix_vocab_mask:**I** *in* attention_mask:**I** *in* presence_mask:**I** *in* seed:**I** *out* sequences:**I** *out* filtered_logits:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipGroupNorm|*in* X:**T** *in* gamma:**M** *in* beta:**M** *in* skip:**T** *in* bias:**T** *out* Y:**T** *out* S:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipLayerNormalization|*in* input:**T** *in* skip:**T** *in* gamma:**T** *in* beta:**T** *in* bias:**T** *out* output:**T** *out* mean:**U** *out* inv_std_var:**U** *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipSimplifiedLayerNormalization|*in* input:**T** *in* skip:**T** *in* gamma:**T** *in* bias:**T** *out* output:**T** *out* mean:**U** *out* inv_std_var:**U** *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
+|SparseAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* past_key:**T** *in* past_value:**T** *in* block_row_indices:**M** *in* block_col_indices:**M** *in* total_sequence_length:**M** *in* key_total_sequence_lengths:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**M** = tensor(int32) **T** = tensor(bfloat16), tensor(float16)|
|TransposeMatMul|*in* A:**T** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|Trilu|*in* X:**T** *in* k:**tensor(int64)** *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|UnfoldTensor|*in* input:**T** *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|WhisperBeamSearch|*in* input_ids:**F** *in* max_length:**I** *in* min_length:**I** *in* num_beams:**I** *in* num_return_sequences:**I** *in* length_penalty:**T** *in* repetition_penalty:**T** *in* vocab_mask:**M** *in* prefix_vocab_mask:**M** *in* attention_mask:**I** *in* decoder_input_ids:**I** *in* logits_processor:**I** *in* cross_qk_layer_head:**I** *in* extra_decoding_ids:**I** *out* sequences:**I** *out* sequences_scores:**T** *out* scores:**T** *out* cross_qk:**V** *out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
+|WhisperBeamSearch|*in* input_ids:**F** *in* max_length:**I** *in* min_length:**I** *in* num_beams:**I** *in* num_return_sequences:**I** *in* length_penalty:**T** *in* repetition_penalty:**T** *in* vocab_mask:**M** *in* prefix_vocab_mask:**M** *in* attention_mask:**I** *in* decoder_input_ids:**I** *in* logits_processor:**I** *in* cross_qk_layer_head:**I** *in* extra_decoding_ids:**I** *in* temperature:**T** *out* sequences:**I** *out* sequences_scores:**T** *out* scores:**T** *out* cross_qk:**V** *out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
| |
| |
@@ -902,7 +943,8 @@ Do not modify directly.*
|Asinh|*in* input:**T** *out* output:**T**|9+|**T** = tensor(float), tensor(float16)|
|Atan|*in* input:**T** *out* output:**T**|7+|**T** = tensor(float), tensor(float16)|
|Atanh|*in* input:**T** *out* output:**T**|9+|**T** = tensor(float), tensor(float16)|
-|AveragePool|*in* X:**T** *out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|AveragePool|*in* X:**T** *out* Y:**T**|19+|**T** = tensor(float), tensor(float16)|
+|||11+|**T** = tensor(float), tensor(float16)|
|||10+|**T** = tensor(float), tensor(float16)|
|||7+|**T** = tensor(float), tensor(float16)|
|BatchNormalization|*in* X:**T** *in* scale:**T** *in* B:**T** *in* input_mean:**U** *in* input_var:**U** *out* Y:**T** *out* running_mean:**U** *out* running_var:**U** or *in* X:**T** *in* scale:**T** *in* B:**T** *in* mean:**T** *in* var:**T** *out* Y:**T** *out* mean:**T** *out* var:**T** *out* saved_mean:**T** *out* saved_var:**T** or *in* X:**T** *in* scale:**T1** *in* B:**T1** *in* input_mean:**T2** *in* input_var:**T2** *out* Y:**T** *out* running_mean:**T2** *out* running_var:**T2**|15+|**T** = tensor(float), tensor(float16)|
@@ -914,10 +956,12 @@ Do not modify directly.*
|BitwiseNot|*in* X:**T** *out* Y:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|BitwiseOr|*in* A:**T** *in* B:**T** *out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|BitwiseXor|*in* A:**T** *in* B:**T** *out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Cast|*in* input:**T1** *out* output:**T2**|13+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Cast|*in* input:**T1** *out* output:**T2**|19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||13+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||9+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||6+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|CastLike|*in* input:**T1** *in* target_type:**T2** *out* output:**T2**|15+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|CastLike|*in* input:**T1** *in* target_type:**T2** *out* output:**T2**|19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||15+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Ceil|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(float), tensor(float16)|
|||6+|**T** = tensor(float), tensor(float16)|
|Celu|*in* X:**T** *out* Y:**T**|12+|**T** = tensor(float), tensor(float16)|
@@ -940,20 +984,23 @@ Do not modify directly.*
|Crop|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|CumSum|*in* x:**T** *in* axis:**T2** *out* y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
|||11+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
-|DFT|*in* input:**T1** *in* dft_length:**T2** *in* axis:**tensor(int64)** *out* output:**T1** or *in* input:**T1** *in* dft_length:**T2** *out* output:**T1**|17+|**T1** = tensor(float), tensor(float16) **T2** = tensor(int64)|
+|DFT|*in* input:**T1** *in* dft_length:**T2** *in* axis:**tensor(int64)** *out* output:**T1** or *in* input:**T1** *in* dft_length:**T2** *out* output:**T1**|20+|**T1** = tensor(double), tensor(float), tensor(float16) **T2** = tensor(int32), tensor(int64)|
+|||17+|**T1** = tensor(double), tensor(float), tensor(float16) **T2** = tensor(int32), tensor(int64)|
|DepthToSpace|*in* input:**T** *out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|DequantizeLinear|*in* x:**T** *in* x_scale:**tensor(float)** *in* x_zero_point:**T** *out* y:**tensor(float)** or *in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|13+|**T** = tensor(int32), tensor(int8), tensor(uint8)|
+|DequantizeLinear|*in* x:**T** *in* x_scale:**tensor(float)** *in* x_zero_point:**T** *out* y:**tensor(float)** or *in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|19+|**T1** = tensor(int32), tensor(int8), tensor(uint8) **T2** = tensor(float), tensor(float16)|
+|||13+|**T** = tensor(int32), tensor(int8), tensor(uint8)|
|||10+|**T** = tensor(int32), tensor(int8), tensor(uint8)|
|Div|*in* A:**T** *in* B:**T** *out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Dropout|*in* data:**T** *in* ratio:**T1** *in* training_mode:**T2** *out* output:**T** *out* mask:**T2** or *in* data:**T** *out* output:**T** *out* mask:**T** or *in* data:**T** *out* output:**T** *out* mask:**T1**|7+|**T** = tensor(float), tensor(float16)|
-|DynamicQuantizeLinear|*in* x:**T1** *out* y:**T2** *out* y_scale:**tensor(float)** *out* y_zero_point:**T2**|11+|**T1** = tensor(float) **T2** = tensor(uint8)|
+|DynamicQuantizeLinear|*in* x:**T1** *out* y:**T2** *out* y_scale:**tensor(float)** *out* y_zero_point:**T2**|11+|**T1** = tensor(float) **T2** = tensor(int8), tensor(uint8)|
|Einsum|*in* Inputs:**T** *out* Output:**T**|12+|**T** = tensor(float), tensor(float16)|
|Elu|*in* X:**T** *out* Y:**T**|6+|**T** = tensor(float), tensor(float16)|
-|Equal|*in* A:**T** *in* B:**T** *out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(bool)|
+|Equal|*in* A:**T** *in* B:**T** *out* C:**T1**|19+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(bool)|
+|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(bool)|
|||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(bool)|
|||7+|**T** = tensor(float), tensor(float16) **T1** = tensor(bool)|
|Erf|*in* input:**T** *out* output:**T**|13+|**T** = tensor(float), tensor(float16)|
@@ -996,7 +1043,8 @@ Do not modify directly.*
|Hardmax|*in* input:**T** *out* output:**T**|13+|**T** = tensor(float), tensor(float16)|
|||11+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
-|Identity|*in* input:**T** *out* output:**T** or *in* input:**V** *out* output:**V**|16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Identity|*in* input:**T** *out* output:**T** or *in* input:**V** *out* output:**V**|19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||14+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -1007,15 +1055,17 @@ Do not modify directly.*
|||7+|**B** = tensor(bool) **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|ImageScaler|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|InstanceNormalization|*in* input:**T** *in* scale:**T** *in* B:**T** *out* output:**T**|6+|**T** = tensor(float), tensor(float16)|
-|IsInf|*in* X:**T1** *out* Y:**T2**|10+|**T1** = tensor(float) **T2** = tensor(bool)|
-|IsNaN|*in* X:**T1** *out* Y:**T2**|13+|**T1** = tensor(float), tensor(float16) **T2** = tensor(bool)|
+|IsInf|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(float) **T2** = tensor(bool)|
+|||10+|**T1** = tensor(float) **T2** = tensor(bool)|
+|IsNaN|*in* X:**T1** *out* Y:**T2**|20+|**T1** = tensor(float), tensor(float16) **T2** = tensor(bool)|
+|||13+|**T1** = tensor(float), tensor(float16) **T2** = tensor(bool)|
|||9+|**T1** = tensor(float), tensor(float16) **T2** = tensor(bool)|
|LRN|*in* X:**T** *out* Y:**T**|13+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
|LSTM|*in* X:**T** *in* W:**T** *in* R:**T** *in* B:**T** *in* sequence_lens:**T1** *in* initial_h:**T** *in* initial_c:**T** *in* P:**T** *out* Y:**T** *out* Y_h:**T** *out* Y_c:**T**|14+|**T** = tensor(float), tensor(float16)|
|||7+|**T** = tensor(float), tensor(float16)|
|LayerNormalization|*in* X:**T** *in* Scale:**T** *in* B:**T** *out* Y:**T** *out* Mean:**U** *out* InvStdDev:**U** or *in* X:**T** *in* Scale:**V** *in* B:**V** *out* Y:**V** *out* Mean:**U** *out* InvStdDev:**U**|17+|**T** = tensor(float), tensor(float16) **U** = tensor(float)|
-|||1+|**T** = tensor(float), tensor(float16) **V** = tensor(float), tensor(float16)|
+|||1+|**T** = tensor(float), tensor(float16) **U** = tensor(float), tensor(float16) **V** = tensor(float), tensor(float16)|
|LeakyRelu|*in* X:**T** *out* Y:**T**|16+|**T** = tensor(float), tensor(float16)|
|||6+|**T** = tensor(float), tensor(float16)|
|Less|*in* A:**T** *in* B:**T** *out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(bool)|
@@ -1029,7 +1079,8 @@ Do not modify directly.*
|||11+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
|LpNormalization|*in* input:**T** *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
-|LpPool|*in* X:**T** *out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|LpPool|*in* X:**T** *out* Y:**T**|18+|**T** = tensor(float), tensor(float16)|
+|||11+|**T** = tensor(float), tensor(float16)|
|||2+|**T** = tensor(float), tensor(float16)|
|MatMul|*in* A:**T** *in* B:**T** *out* Y:**T**|13+|**T** = tensor(float), tensor(float16)|
|||9+|**T** = tensor(float), tensor(float16)|
@@ -1089,8 +1140,9 @@ Do not modify directly.*
|||12+|**T** = tensor(float), tensor(float16), tensor(int32) **T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)|
|||7+|**T** = tensor(float), tensor(float16)|
|QLinearConv|*in* x:**T1** *in* x_scale:**tensor(float)** *in* x_zero_point:**T1** *in* w:**T2** *in* w_scale:**tensor(float)** *in* w_zero_point:**T2** *in* y_scale:**tensor(float)** *in* y_zero_point:**T3** *in* B:**T4** *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(int8), tensor(uint8) **T4** = tensor(int32)|
-|QLinearMatMul|*in* a:**T1** *in* a_scale:**tensor(float)** *in* a_zero_point:**T1** *in* b:**T2** *in* b_scale:**tensor(float)** *in* b_zero_point:**T2** *in* y_scale:**tensor(float)** *in* y_zero_point:**T3** *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(int8), tensor(uint8)|
-|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2** or *in* x:**T1** *in* y_scale:**tensor(float)** *in* y_zero_point:**T2** *out* y:**T2**|13+|**T1** = tensor(float), tensor(int32) **T2** = tensor(int8), tensor(uint8)|
+|QLinearMatMul|*in* a:**T1** *in* a_scale:**TS** *in* a_zero_point:**T1** *in* b:**T2** *in* b_scale:**TS** *in* b_zero_point:**T2** *in* y_scale:**TS** *in* y_zero_point:**T3** *out* y:**T3** or *in* a:**T1** *in* a_scale:**tensor(float)** *in* a_zero_point:**T1** *in* b:**T2** *in* b_scale:**tensor(float)** *in* b_zero_point:**T2** *in* y_scale:**tensor(float)** *in* y_zero_point:**T3** *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(int8), tensor(uint8)|
+|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2** or *in* x:**T1** *in* y_scale:**tensor(float)** *in* y_zero_point:**T2** *out* y:**T2**|19+|**T1** = tensor(float), tensor(float16), tensor(int32) **T2** = tensor(int8), tensor(uint8)|
+|||13+|**T1** = tensor(float), tensor(int32) **T2** = tensor(int8), tensor(uint8)|
|||10+|**T1** = tensor(float), tensor(int32) **T2** = tensor(int8), tensor(uint8)|
|RNN|*in* X:**T** *in* W:**T** *in* R:**T** *in* B:**T** *in* sequence_lens:**T1** *in* initial_h:**T** *out* Y:**T** *out* Y_h:**T**|14+|**T** = tensor(float), tensor(float16)|
|||7+|**T** = tensor(float), tensor(float16)|
@@ -1113,7 +1165,8 @@ Do not modify directly.*
|||13+|**T** = tensor(float), tensor(float16)|
|||11+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
-|ReduceMax|*in* data:**T** *in* axes:**tensor(int64)** *out* reduced:**T** or *in* data:**T** *out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
+|ReduceMax|*in* data:**T** *in* axes:**tensor(int64)** *out* reduced:**T** or *in* data:**T** *out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11+|**T** = tensor(float), tensor(float16)|
@@ -1141,11 +1194,12 @@ Do not modify directly.*
|Relu|*in* X:**T** *out* Y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)|
|||13+|**T** = tensor(float), tensor(float16)|
|||6+|**T** = tensor(float), tensor(float16)|
-|Reshape|*in* data:**T** *in* shape:**tensor(int64)** *out* reshaped:**T** or *in* data:**T** *out* reshaped:**T**|14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Reshape|*in* data:**T** *in* shape:**tensor(int64)** *out* reshaped:**T** or *in* data:**T** *out* reshaped:**T**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||5+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Resize|*in* X:**T** *in* scales:**tensor(float)** *out* Y:**T** or *in* X:**T1** *in* roi:**T2** *in* scales:**tensor(float)** *in* sizes:**tensor(int64)** *out* Y:**T1**|13+|**T1** = tensor(float), tensor(float16) **T2** = tensor(float), tensor(float16)|
-|||11+|**T1** = tensor(float), tensor(float16) **T2** = tensor(float), tensor(float16)|
+|Resize|*in* X:**T** *in* scales:**tensor(float)** *out* Y:**T** or *in* X:**T1** *in* roi:**T2** *in* scales:**tensor(float)** *in* sizes:**tensor(int64)** *out* Y:**T1**|13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8) **T2** = tensor(float), tensor(float16)|
+|||11+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8) **T2** = tensor(float), tensor(float16)|
|||10+|**T** = tensor(float), tensor(float16)|
|ReverseSequence|*in* input:**T** *in* sequence_lens:**tensor(int64)** *out* Y:**T**|10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|RoiAlign|*in* X:**T1** *in* rois:**T1** *in* batch_indices:**T2** *out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16) **T2** = tensor(int32), tensor(int64)|
@@ -1169,7 +1223,8 @@ Do not modify directly.*
|SequenceErase|*in* input_sequence:**S** *in* position:**I** *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
|SequenceInsert|*in* input_sequence:**S** *in* tensor:**T** *in* position:**I** *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
|SequenceLength|*in* input_sequence:**S** *out* length:**I**|11+|**I** = tensor(int64) **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
-|Shape|*in* data:**T** *out* shape:**T1**|15+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|Shape|*in* data:**T** *out* shape:**T1**|19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|||15+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|Shrink|*in* input:**T** *out* output:**T**|9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)|
@@ -1177,9 +1232,11 @@ Do not modify directly.*
|||6+|**T** = tensor(float), tensor(float16)|
|Sign|*in* input:**T** *out* output:**T**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|SimplifiedLayerNormalization|*in* X:**T** *in* scale:**V** *out* Y:**V** *out* inv_std_var:**U**|1+|**T** = tensor(float), tensor(float16) **U** = tensor(float), tensor(float16) **V** = tensor(float), tensor(float16)|
|Sin|*in* input:**T** *out* output:**T**|7+|**T** = tensor(float), tensor(float16)|
|Sinh|*in* input:**T** *out* output:**T**|9+|**T** = tensor(float), tensor(float16)|
-|Size|*in* data:**T** *out* size:**T1**|13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|Size|*in* data:**T** *out* size:**T1**|19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
+|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **T1** = tensor(int64)|
|Slice|*in* data:**T** *in* starts:**Tind** *in* ends:**Tind** *in* axes:**Tind** *in* steps:**Tind** *out* output:**T** or *in* data:**T** *out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8) **Tind** = tensor(int32), tensor(int64)|
@@ -1238,19 +1295,29 @@ Do not modify directly.*
|BiasSplitGelu|*in* X:**T** *in* bias:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|ConvTransposeWithDynamicPads|*in* X:**T** *in* W:**T** *in* Pads:**tensor(int64)** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|DequantizeLinear|*in* x:**T1** *in* x_scale:**T2** *in* x_zero_point:**T1** *out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8) **T2** = tensor(float), tensor(float16)|
+|DynamicQuantizeMatMul|*in* A:**T1** *in* B:**T2** *in* b_scale:**T1** *in* b_zero_point:**T2** *in* bias:**T1** *out* Y:**T1**|1+|**T1** = tensor(float) **T2** = tensor(int8), tensor(uint8)|
|EmbedLayerNormalization|*in* input_ids:**T1** *in* segment_ids:**T1** *in* word_embedding:**T** *in* position_embedding:**T** *in* segment_embedding:**T** *in* gamma:**T** *in* beta:**T** *in* mask:**T1** *in* position_ids:**T1** *out* output:**T** *out* mask_index:**T1** *out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
+|FastGelu|*in* X:**T** *in* bias:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|FusedMatMul|*in* A:**T** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|FusedMatMulActivation|*in* A:**T** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|Gelu|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|GroupNorm|*in* X:**T** *in* gamma:**M** *in* beta:**M** *out* Y:**T**|1+|**M** = tensor(float), tensor(float16) **T** = tensor(float), tensor(float16)|
+|GroupQueryAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* past_key:**T** *in* past_value:**T** *in* seqlens_k:**M** *in* total_sequence_length:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**M** = tensor(int32) **T** = tensor(float), tensor(float16)|
+|MatMulIntegerToFloat|*in* A:**T1** *in* B:**T2** *in* a_scale:**T3** *in* b_scale:**T3** *in* a_zero_point:**T1** *in* b_zero_point:**T2** *in* bias:**T3** *out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(float), tensor(float16)|
+|MatMulNBits|*in* A:**T1** *in* B:**T2** *in* scales:**T1** *in* zero_points:**T3** *in* g_idx:**T4** *in* bias:**T1** *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16) **T2** = tensor(uint8)|
|MultiHeadAttention|*in* query:**T** *in* key:**T** *in* value:**T** *in* bias:**T** *in* key_padding_mask:**M** *in* relative_position_bias:**T** *in* past_key:**T** *in* past_value:**T** *out* output:**T** *out* present_key:**T** *out* present_value:**T**|1+|**M** = tensor(int32) **T** = tensor(float), tensor(float16)|
|NhwcConv|*in* X:**T** *in* W:**T** *in* B:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
+|QAttention|*in* input:**T1** *in* weight:**T2** *in* bias:**T3** *in* input_scale:**T3** *in* weight_scale:**T3** *in* mask_index:**T4** *in* input_zero_point:**T1** *in* weight_zero_point:**T2** *in* past:**T3** *out* output:**T3** *out* present:**T3**|1+|**T1** = tensor(int8), tensor(uint8) **T2** = tensor(int8), tensor(uint8) **T3** = tensor(float), tensor(float16) **T4** = tensor(int32)|
|QLinearAdd|*in* A:**T** *in* A_scale:**tensor(float)** *in* A_zero_point:**T** *in* B:**T** *in* B_scale:**tensor(float)** *in* B_zero_point:**T** *in* C_scale:**tensor(float)** *in* C_zero_point:**T** *out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)|
+|QLinearAveragePool|*in* X:**T** *in* x_scale:**tensor(float)** *in* x_zero_point:**T** *in* y_scale:**tensor(float)** *in* y_zero_point:**T** *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
+|QLinearConcat|*in* Y_scale:**TF** *in* Y_zero_point:**T8** *in* inputs:**TV** *out* Y:**T8**|1+|**T8** = tensor(int8), tensor(uint8) **TF** = tensor(float) **TV** = tensor(float), tensor(int8), tensor(uint8)|
+|QLinearGlobalAveragePool|*in* X:**T** *in* x_scale:**tensor(float)** *in* x_zero_point:**T** *in* y_scale:**tensor(float)** *in* y_zero_point:**T** *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearSigmoid|*in* X:**T** *in* X_scale:**tensor(float)** *in* X_zero_point:**T** *in* Y_scale:**tensor(float)** *in* Y_zero_point:**T** *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1** *in* y_scale:**T1** *in* y_zero_point:**T2** *out* y:**T2**|1+|**T1** = tensor(float), tensor(float16), tensor(int32) **T2** = tensor(int8), tensor(uint8)|
|QuickGelu|*in* X:**T** *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|RotaryEmbedding|*in* input:**T** *in* position_ids:**M** *in* cos_cache:**T** *in* sin_cache:**T** *out* output:**T**|1+|**M** = tensor(int64) **T** = tensor(float), tensor(float16)|
|SkipLayerNormalization|*in* input:**T** *in* skip:**T** *in* gamma:**T** *in* beta:**T** *in* bias:**T** *out* output:**T** *out* mean:**U** *out* inv_std_var:**U** *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
+|SkipSimplifiedLayerNormalization|*in* input:**T** *in* skip:**T** *in* gamma:**T** *in* bias:**T** *out* output:**T** *out* mean:**U** *out* inv_std_var:**U** *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
| |
| |
|**Operator Domain:** *com.microsoft.dml*||||
diff --git a/docs/Python_Dev_Notes.md b/docs/Python_Dev_Notes.md
deleted file mode 100644
index 78804bd9f2f86..0000000000000
--- a/docs/Python_Dev_Notes.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Python Dev Notes
-
-Each Python version uses a specific compiler version. In most cases, you should use the same compiler version for building python extensions.
-
-## Which Microsoft Visual C++ compiler to use with a specific Python version ?
-
-| Visual C++ | CPython |
-|-------------|:-----------------------:|
-|2015, 2017 | 3.7 |
-|2015 | 3.5,3.6 |
-|2010 | 3.3,3.4 |
-|2008 | 2.6, 2.7, 3.0, 3.1, 3.2 |
-
-Currently, the official ONNXRuntime Python wheel (v1.3.0 onwards) hosted on PyPi requires [Visual C++ 2019 runtime ](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads) installed on the target machine.
-
-If the Python wheel is built from source using the build toolset provided with Visual Studio 2017, it will work with the Visual C++ 2017 runtime.
-
-CPython 3.7 is distributed with a VC++ 2017 runtime. Unlike the earlier VC++ version, VC++ 2017 Runtime is binary backward compatible with VC++ 2015. Which means you could build your application with VC++ 2015 then run it with VC++ 2017 runtime.
diff --git a/docs/python/README.rst b/docs/python/README.rst
index 32bb3729e01d0..6c493e206a493 100644
--- a/docs/python/README.rst
+++ b/docs/python/README.rst
@@ -8,6 +8,16 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
 using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
+// TODO: When other compilers support std::chrono::operator<<, update this.
+// TODO: Check support for other compilers' version before enable C++20 for other compilers.
+// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4.
+#if __cplusplus >= 202002L && __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L
+namespace timestamp_ns = std::chrono;
+#else
+namespace timestamp_ns = ::date;
+#endif
+
#ifndef NDEBUG
ORT_ATTRIBUTE_UNUSED static bool vlog_enabled = true; // Set directly based on your needs.
#else
@@ -75,6 +86,21 @@ struct Category {
// TODO: What other high level categories are meaningful? Model? Optimizer? Execution?
};
+/// <summary>
+/// ORT TraceLogging keywords for categories of dynamic logging enablement
+/// </summary>
+enum class ORTTraceLoggingKeyword : uint64_t {
+ Session = 0x1, // ORT Session TraceLoggingWrite
+ Logs = 0x2, // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required
+ Reserved1 = 0x4, // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses
+ Reserved2 = 0x8,
+ Reserved3 = 0x10,
+ Reserved4 = 0x20,
+ Reserved5 = 0x40,
+ Reserved6 = 0x80,
+ Profiling = 0x100 // Enables profiling. At higher levels >5 can impact inference performance
+};
+
class ISink;
class Logger;
class Capture;
@@ -333,5 +359,17 @@ unsigned int GetThreadId();
*/
unsigned int GetProcessId();
+/**
+ If the ONNXRuntimeTraceLoggingProvider ETW Provider is enabled, then adds to the existing logger.
+*/
+std::unique_ptr<Logger> EnhanceLoggerWithEtw(std::unique_ptr<Logger> existingLogger, logging::Severity originalSeverity,
+ logging::Severity etwSeverity);
+
+/**
+ If the ONNXRuntimeTraceLoggingProvider ETW Provider is enabled, then can override the logging level.
+  But this overridden level only applies to the ETW sink. The original logger(s) retain their original logging level.
+*/
+Severity OverrideLevelWithEtw(Severity originalSeverity);
+
} // namespace logging
} // namespace onnxruntime
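
The `ORTTraceLoggingKeyword` values above are bit flags meant to be OR-ed into an ETW session's keyword mask. A minimal, self-contained sketch of combining and testing them (not part of the patch; the `Combine`/`IsEnabled` helpers are illustrative only):

```cpp
// Sketch only: the enum values mirror the header; Combine/IsEnabled are not ONNX Runtime API.
#include <cstdint>

enum class ORTTraceLoggingKeyword : uint64_t {
  Session = 0x1,
  Logs = 0x2,
  Profiling = 0x100,
};

constexpr uint64_t Combine(ORTTraceLoggingKeyword a, ORTTraceLoggingKeyword b) {
  return static_cast<uint64_t>(a) | static_cast<uint64_t>(b);
}

constexpr bool IsEnabled(uint64_t enabled_keywords, ORTTraceLoggingKeyword k) {
  return (enabled_keywords & static_cast<uint64_t>(k)) != 0;
}

// An ETW session that enables Logs | Profiling matches either keyword, but not Session.
static_assert(IsEnabled(Combine(ORTTraceLoggingKeyword::Logs, ORTTraceLoggingKeyword::Profiling),
                        ORTTraceLoggingKeyword::Profiling));
static_assert(!IsEnabled(Combine(ORTTraceLoggingKeyword::Logs, ORTTraceLoggingKeyword::Profiling),
                         ORTTraceLoggingKeyword::Session));
```
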
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index cbc2208b6bbd7..097873c5e3653 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -3,12 +3,14 @@
#pragma once
+#include <map>
+
#include "core/common/common.h"
#include "core/framework/allocator_stats.h"
+// some enums are defined in session/onnxruntime_c_api.h but used in ortdevice.h/ortmemory.h
#include "core/session/onnxruntime_c_api.h"
-#include "ortdevice.h"
-#include "ortmemoryinfo.h"
-#include <map>
+#include "core/framework/ortdevice.h"
+#include "core/framework/ortmemoryinfo.h"
// This configures the arena based allocator used by ORT
// See docs/C_API.md for details on what these mean and how to choose these values
@@ -68,13 +70,16 @@ class IAllocator {
IAllocator(const OrtMemoryInfo& info) : memory_info_(info) {}
virtual ~IAllocator() = default;
/**
- @remarks Use SafeInt when calculating the size of memory to allocate using Alloc.
- */
+ * Allocate memory of the specified size.
+ * If size is 0, nullptr is returned.
+ * If allocation fails, an exception is thrown.
+ *
+ * @remarks Use SafeInt when calculating the size of memory to allocate using Alloc.
+ */
virtual void* Alloc(size_t size) = 0;
virtual void Free(void* p) = 0;
- // TODO: Find a better name than Reserve() and update in all places.
// Reserve() is an interface exposed for an implementation of IAllocator
// to optionally implement some allocation logic that by-passes any arena-based
// logic that may be housed in the Alloc() implementation.
@@ -100,7 +105,8 @@ class IAllocator {
* \param out Total size required after any alignment is applied
* \return true, successful. false, overflow
*/
- [[nodiscard]] static bool CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, size_t alignment, size_t* out) noexcept;
+ [[nodiscard]] static bool CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, size_t alignment,
+ size_t* out) noexcept;
/**
* https://cwe.mitre.org/data/definitions/190.html
@@ -120,8 +126,10 @@ class IAllocator {
*/
void* AllocArray(size_t nmemb, size_t size) {
size_t len;
- if (!CalcMemSizeForArray(nmemb, size, &len))
- return nullptr;
+ if (!CalcMemSizeForArray(nmemb, size, &len)) {
+ ORT_THROW("Invalid size requested for allocation: ", nmemb, " * ", size);
+ }
+
return Alloc(len);
}
@@ -131,8 +139,10 @@ class IAllocator {
  template <size_t alignment>
void* AllocArrayWithAlignment(size_t nmemb, size_t size) {
size_t len;
- if (!CalcMemSizeForArrayWithAlignment(nmemb, size, alignment, &len))
- return nullptr;
+ if (!CalcMemSizeForArrayWithAlignment(nmemb, size, alignment, &len)) {
+ ORT_THROW("Invalid size requested for allocation: ", nmemb, " * ", size, " with alignment ", alignment);
+ }
+
return Alloc(len);
}
@@ -144,13 +154,14 @@ class IAllocator {
@param stream Which stream instance allocated chunk will be used with.
@param wait_fn If the allocator want to dynamic reuse a chunk from another stream, use this wait_fn to sync on
the target stream to make the reuse safe.
- @returns std::unique_ptr with allocated memory and deleter.
+ @returns std::unique_ptr with allocated memory and deleter. Throws if it cannot allocate memory.
*/
  template <typename T>
  static IAllocatorUniquePtr<T> MakeUniquePtr(std::shared_ptr<IAllocator> allocator, size_t count_or_bytes,
bool use_reserve = false,
Stream* stream = nullptr, WaitNotificationFn wait_fn = nullptr) {
- if (allocator == nullptr) return nullptr;
+ ValidateAllocator(allocator);
+
// for now limit to fundamental types. we could support others, but to do so either we or the caller
// needs to call the dtor for the objects, for buffers allocated on device we don't have destructor
    // static_assert(std::is_fundamental<T>::value, "Fundamental type required as no destructors are called.");
@@ -161,38 +172,73 @@ class IAllocator {
    if constexpr (!std::is_void<T>::value) {
// sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't
// reachable if T is void. use std::conditional to 'use' void* in the sizeof call
- if (!CalcMemSizeForArray(
-          count_or_bytes, sizeof(typename std::conditional<std::is_void<T>::value, void*, T>::type), &alloc_size)) {
- return nullptr;
- }
+      constexpr auto size = sizeof(typename std::conditional<std::is_void<T>::value, void*, T>::type);
+ alloc_size = ValidatedCalcMemSizeForArray(count_or_bytes, size);
}
// allocate
    T* p = static_cast<T*>(AllocateBufferWithOptions(*allocator, alloc_size, use_reserve, stream, std::move(wait_fn)));
-    return IAllocatorUniquePtr<T>{
- p,
- [allocator = std::move(allocator)](T* p) { allocator->Free(p); }};
+ ValidateAllocation(p, alloc_size);
+
+    return IAllocatorUniquePtr<T>{p,
+ [allocator = std::move(allocator)](T* p) {
+ allocator->Free(p);
+ }};
}
+ /**
+ Create a std::unique_ptr that is allocated and freed by the provided OrtAllocator.
+ @param ort_allocator The allocator.
+ @param count_or_bytes The exact bytes to allocate if T is void, otherwise the number of elements to allocate.
+ @returns std::unique_ptr with allocated memory and deleter. Throws if it cannot allocate memory.
+ */
  template <typename T>
  static IAllocatorUniquePtr<T> MakeUniquePtrFromOrtAllocator(OrtAllocator* ort_allocator, size_t count_or_bytes) {
- if (!ort_allocator) return nullptr;
+ ValidateAllocator(ort_allocator);
size_t alloc_size = count_or_bytes;
// if T is not void, 'count_or_bytes' == number of items so allow for that
    if constexpr (!std::is_void<T>::value) {
// sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't
// reachable if T is void. use std::conditional to 'use' void* in the sizeof call
- if (!CalcMemSizeForArray(
-          count_or_bytes, sizeof(typename std::conditional<std::is_void<T>::value, void*, T>::type), &alloc_size)) {
- return nullptr;
- }
+      constexpr auto size = sizeof(typename std::conditional<std::is_void<T>::value, void*, T>::type);
+ alloc_size = ValidatedCalcMemSizeForArray(count_or_bytes, size);
}
-    T* p = static_cast<T*>(ort_allocator->Alloc(ort_allocator, count_or_bytes));
-    return IAllocatorUniquePtr<T>{p, [ort_allocator](T* p) { ort_allocator->Free(ort_allocator, p); }};
+
+    T* p = static_cast<T*>(ort_allocator->Alloc(ort_allocator, alloc_size));
+ ValidateAllocation(p, alloc_size);
+
+    return IAllocatorUniquePtr<T>{p,
+ [ort_allocator](T* p) {
+ ort_allocator->Free(ort_allocator, p);
+ }};
}
private:
+ //
+ // validation functions. split out from methods that are templatized on the data type to minimize binary size.
+ //
+
+  template <typename T>
+ static void ValidateAllocator(const T& allocator) {
+ ORT_ENFORCE(allocator != nullptr);
+ }
+
+ static size_t ValidatedCalcMemSizeForArray(size_t count, size_t size) {
+ size_t alloc_size = 0;
+ if (!CalcMemSizeForArray(count, size, &alloc_size)) {
+ ORT_THROW("Invalid size requested for allocation: ", count, " * ", size);
+ }
+
+ return alloc_size;
+ }
+
+ static void ValidateAllocation(void* p, size_t size) {
+ // allocator should throw directly but in case it didn't ensure we do here so that calling code doesn't
+ // need to check for nullptr when an actual allocation was expected.
+ ORT_ENFORCE(p != nullptr || size == 0, "Memory allocation failed. Size=", size);
+ };
+
OrtMemoryInfo memory_info_;
};
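
The `MakeUniquePtr`/`AllocArray*` paths above now throw on overflow or allocation failure instead of returning nullptr. A standalone sketch of the overflow check that `ValidatedCalcMemSizeForArray` performs (the `CheckedArrayBytes` helper is illustrative, not ORT API):

```cpp
#include <cstddef>
#include <iostream>
#include <limits>
#include <stdexcept>

// Throws instead of silently returning a null allocation when nmemb * size would overflow.
std::size_t CheckedArrayBytes(std::size_t nmemb, std::size_t size) {
  if (size != 0 && nmemb > std::numeric_limits<std::size_t>::max() / size) {
    throw std::runtime_error("Invalid size requested for allocation");
  }
  return nmemb * size;
}

int main() {
  std::cout << CheckedArrayBytes(16, sizeof(double)) << "\n";  // 128
  try {
    CheckedArrayBytes(std::numeric_limits<std::size_t>::max(), 2);  // overflows
  } catch (const std::exception& e) {
    std::cout << "threw: " << e.what() << "\n";
  }
  return 0;
}
```
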
diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h
index f3942128077de..b197d88090432 100644
--- a/include/onnxruntime/core/framework/data_types.h
+++ b/include/onnxruntime/core/framework/data_types.h
@@ -15,6 +15,7 @@
#include "core/framework/endian.h"
#include "core/framework/float8.h"
#include "core/framework/float16.h"
+#include "core/framework/int4.h"
#include "core/graph/onnx_protobuf.h"
#include "core/framework/to_tensor_proto_element_type.h"
@@ -280,7 +281,8 @@ struct IsAnyOf {
 template <typename T>
struct IsTensorContainedType : public IsAnyOf {
* Base class for primitive Tensor contained types
*
* \details This class contains an integer constant that can be
- * used for input data type dispatching
+ * used for input data type dispatching. This class also stores the number of subelements per size units.
+ * Example: For int4, the size unit is 1 byte and the number of subelements is 2.
*
*/
class PrimitiveDataTypeBase : public DataTypeImpl {
@@ -934,12 +937,21 @@ class PrimitiveDataTypeBase : public DataTypeImpl {
return data_type_;
}
+ int32_t GetNumSubElems() const {
+ return num_sub_elems_;
+ }
+
+ bool HasSubElems() const {
+ return num_sub_elems_ > 1;
+ }
+
protected:
- PrimitiveDataTypeBase(size_t size, int32_t data_type)
- : DataTypeImpl{GeneralType::kPrimitive, size}, data_type_{data_type} {}
+ PrimitiveDataTypeBase(size_t size, int32_t data_type, int32_t num_sub_elems)
+ : DataTypeImpl{GeneralType::kPrimitive, size}, data_type_{data_type}, num_sub_elems_{num_sub_elems} {}
private:
const int32_t data_type_;
+ const int32_t num_sub_elems_; // > 1 for subbyte primitives, 1 for normal primitives.
};
/**
@@ -965,9 +977,9 @@ class PrimitiveDataType : public PrimitiveDataTypeBase {
}
private:
- PrimitiveDataType()
+ explicit PrimitiveDataType(int32_t num_sub_elems)
      : PrimitiveDataTypeBase{sizeof(T),
-                            utils::ToTensorProtoElementType<T>()} {
+                            utils::ToTensorProtoElementType<T>(), num_sub_elems} {
}
};
@@ -1074,15 +1086,30 @@ inline const PrimitiveDataTypeBase* DataTypeImpl::AsPrimitiveDataType() const {
    return SequenceTensorType<ELEM_TYPE>::Type(); \
}
-#define ORT_REGISTER_PRIM_TYPE(TYPE) \
- template <> \
-  MLDataType PrimitiveDataType<TYPE>::Type() {     \
-    static PrimitiveDataType<TYPE> prim_data_type; \
- return &prim_data_type; \
- } \
- template <> \
-  MLDataType DataTypeImpl::GetType<TYPE>() {       \
-    return PrimitiveDataType<TYPE>::Type();        \
+#define ORT_REGISTER_PRIM_TYPE(TYPE) \
+ template <> \
+  MLDataType PrimitiveDataType<TYPE>::Type() {        \
+    static PrimitiveDataType<TYPE> prim_data_type(1); \
+ return &prim_data_type; \
+ } \
+ template <> \
+  MLDataType DataTypeImpl::GetType<TYPE>() {          \
+    return PrimitiveDataType<TYPE>::Type();           \
+ }
+
+// Registers a subbyte primitive.
+// Examples:
+// - Int4x2 stores 2 packed 4-bit elements in 1 byte: ORT_*_SUBBYTE_TYPE(Int4x2, 2)
+// - [not supported] Int3x8 could store 8 packed 3-bit elements in 3 bytes: ORT_*_SUBBYTE_TYPE(Int3x8, 8)
+#define ORT_REGISTER_PRIM_SUBBYTE_TYPE(TYPE, NUM_SUB_ELEMS) \
+ template <> \
+  MLDataType PrimitiveDataType<TYPE>::Type() {                    \
+    static PrimitiveDataType<TYPE> prim_data_type(NUM_SUB_ELEMS); \
+ return &prim_data_type; \
+ } \
+ template <> \
+  MLDataType DataTypeImpl::GetType<TYPE>() {                      \
+    return PrimitiveDataType<TYPE>::Type();                       \
}
#define ORT_REGISTER_OPAQUE_TYPE(CPPType, Domain, Name) \
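
For sub-byte registrations such as `ORT_REGISTER_PRIM_SUBBYTE_TYPE(Int4x2, 2)`, the stored `num_sub_elems_` is what lets storage be computed in whole size units. A small standalone sketch of that rounding (the `NumStorageUnits` helper is illustrative only):

```cpp
#include <cstddef>
#include <iostream>

// An Int4x2-style type stores num_sub_elems logical elements per one storage unit (byte).
std::size_t NumStorageUnits(std::size_t num_elems, std::size_t num_sub_elems) {
  // Round up so an odd element count still occupies whole storage units.
  return (num_elems + num_sub_elems - 1) / num_sub_elems;
}

int main() {
  std::cout << NumStorageUnits(7, 2) << "\n";  // 7 int4 values -> 4 bytes
  std::cout << NumStorageUnits(8, 2) << "\n";  // 8 int4 values -> 4 bytes
  return 0;
}
```
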
diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h
index fbeee8a2aedc5..05f4c10995ef2 100644
--- a/include/onnxruntime/core/framework/data_types_internal.h
+++ b/include/onnxruntime/core/framework/data_types_internal.h
@@ -93,6 +93,12 @@ namespace utils {
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ: \
      function<Float8E5M2FNUZ>(__VA_ARGS__);              \
      break;                                              \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:      \
+      function<Int4x2>(__VA_ARGS__);                     \
+      break;                                             \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:     \
+      function<UInt4x2>(__VA_ARGS__);                    \
+ break; \
default: \
ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
}
@@ -153,6 +159,12 @@ namespace utils {
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ: \
      retval = function<Float8E5M2FNUZ>(__VA_ARGS__);     \
      break;                                              \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:      \
+      retval = function<Int4x2>(__VA_ARGS__);            \
+      break;                                             \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:     \
+      retval = function<UInt4x2>(__VA_ARGS__);           \
+ break; \
default: \
ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
}
@@ -203,6 +215,12 @@ namespace utils {
case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16: \
      function<BFloat16>(__VA_ARGS__);                    \
      break;                                              \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:      \
+      function<Int4x2>(__VA_ARGS__);                     \
+      break;                                             \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:     \
+      function<UInt4x2>(__VA_ARGS__);                    \
+ break; \
default: \
ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
}
@@ -251,6 +269,12 @@ namespace utils {
case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16: \
      retval = function<BFloat16>(__VA_ARGS__);           \
      break;                                              \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:      \
+      retval = function<Int4x2>(__VA_ARGS__);            \
+      break;                                             \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:     \
+      retval = function<UInt4x2>(__VA_ARGS__);           \
+ break; \
default: \
ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
}
@@ -305,7 +329,7 @@ class CallableDispatchableHelper {
return 0;
}
- void CheckCalledOnce() {
+ void CheckCalledOnce() const {
ORT_ENFORCE(called_ == 1, "Unsupported data type: ", dt_type_);
}
};
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index ea4f52f99649d..16ad943a5f47e 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -33,6 +33,8 @@ class Node;
#include "core/framework/stream_handles.h"
#include "core/framework/tuning_context.h"
+struct OrtRunOptions;
+
namespace onnxruntime {
/**
@@ -51,6 +53,8 @@ struct NodeComputeInfo {
DestroyFunctionStateFunc release_state_func;
};
+using RunOptions = ::OrtRunOptions;
+
enum class DataLayout {
NCHW,
NHWC,
@@ -59,14 +63,11 @@ enum class DataLayout {
class IExecutionProvider {
protected:
- IExecutionProvider(const std::string& type, bool use_metadef_id_creator = false)
- : IExecutionProvider(type, OrtDevice(), use_metadef_id_creator) {}
+ IExecutionProvider(const std::string& type)
+ : IExecutionProvider(type, OrtDevice()) {}
- IExecutionProvider(const std::string& type, OrtDevice device, bool use_metadef_id_creator = false)
+ IExecutionProvider(const std::string& type, OrtDevice device)
: default_device_(device), type_{type} {
- if (use_metadef_id_creator) {
-      metadef_id_generator_ = std::make_unique<ModelMetadefIdGenerator>();
- }
}
/*
@@ -187,7 +188,7 @@ class IExecutionProvider {
Run may not be finished on device This function should be regarded as the
point after which a new Run would start to submit commands from CPU
*/
- virtual common::Status OnRunStart() { return Status::OK(); }
+ virtual common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); }
/**
Called when InferenceSession::Run ended
@@ -195,25 +196,27 @@ class IExecutionProvider {
may not be finished on device This function should be regarded as the point
that all commands of current Run has been submmited by CPU
*/
- virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); }
+ virtual common::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) {
+ return Status::OK();
+ }
/**
Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for
- the provider. Currently only CUDA execution provider supports it.
+ the provider.
*/
virtual bool IsGraphCaptureEnabled() const { return false; }
/**
- Indicate whether the graph has been captured and instantiated. Currently
- only CUDA execution provider supports it.
+ Indicate whether the graph has been captured and instantiated.
*/
- virtual bool IsGraphCaptured() const { return false; }
+ virtual bool IsGraphCaptured(int /*graph_annotation_id*/) const { return false; }
/**
- Run the instantiated graph. Currently only CUDA execution provider supports
- it.
+ Run the instantiated graph.
*/
- virtual common::Status ReplayGraph() { return Status::OK(); }
+ virtual common::Status ReplayGraph(int /*graph_annotation_id*/) {
+ return Status::OK();
+ }
/**
Called when session creation is complete
@@ -274,19 +277,6 @@ class IExecutionProvider {
return logger_;
}
- /** Generate a unique id that can be used in a MetaDef name. Values are unique for a model instance.
- The model hash is also returned if you wish to include that in the MetaDef name to ensure uniqueness across models.
- @param graph_viewer[in] Graph viewer that GetCapability was called with. Can be for the main graph or nested graph.
- @param model_hash[out] Returns the hash for the main (i.e. top level) graph in the model.
- This is created using the model path if available,
- or the model input names and the output names from all nodes in the main graph.
- @remarks e.g. the TensorRT Execution Provider is used in multiple sessions and the underlying infrastructure caches
- compiled kernels, so the name must be unique and deterministic across models and sessions.
- NOTE: Ideally this would be a protected method, but to work across the EP bridge it has to be public and
- virtual, and ModelMetadefIdGenerator but be defined in the header as well.
- */
- virtual int GenerateMetaDefId(const onnxruntime::GraphViewer& graph_viewer, HashValue& model_hash) const;
-
  virtual std::unique_ptr<profiling::EpProfiler> GetProfiler() {
return {};
}
@@ -326,23 +316,19 @@ class IExecutionProvider {
*/
  virtual std::vector<AllocatorPtr> CreatePreferredAllocators() { return std::vector<AllocatorPtr>(); };
+ /**
+ * Get the array of pointers for EPContext nodes
+ * EP needs to implement this if has the requirement to generate the context cache model. Otherwise leave it.
+ * Default return an empty vector if not provided by the Execution Provider
+ */
+  virtual const InlinedVector<const Node*> GetEpContextNodes() const {
+    return InlinedVector<const Node*>();
+ }
+
private:
const std::string type_;
// It will be set when this object is registered to a session
const logging::Logger* logger_ = nullptr;
-
- // helper to generate ids that are unique to model and deterministic, even if the execution provider is shared across
- // multiple sessions.
- class ModelMetadefIdGenerator {
- public:
- int GenerateId(const onnxruntime::GraphViewer& graph_viewer, HashValue& model_hash);
-
- private:
-    std::unordered_map<HashValue, HashValue> main_graph_hash_;  // map graph instance hash to model contents hash
-    std::unordered_map<HashValue, int> model_metadef_id_;       // current unique id for model
- };
-
-  std::unique_ptr<ModelMetadefIdGenerator> metadef_id_generator_;
};
} // namespace onnxruntime
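
A self-contained mock of the revised hook shapes above: `OnRunStart`/`OnRunEnd` now receive the run options, and graph-capture queries take a graph annotation id. All types below are stand-ins for illustration, not ONNX Runtime's:

```cpp
#include <iostream>

struct MockRunOptions { bool log_verbose = false; };          // stand-in for ::OrtRunOptions
struct MockStatus { static MockStatus OK() { return {}; } };  // stand-in for common::Status

class MockExecutionProvider {
 public:
  virtual ~MockExecutionProvider() = default;
  virtual MockStatus OnRunStart(const MockRunOptions& /*run_options*/) { return MockStatus::OK(); }
  virtual MockStatus OnRunEnd(bool /*sync_stream*/, const MockRunOptions& /*run_options*/) { return MockStatus::OK(); }
  virtual bool IsGraphCaptured(int /*graph_annotation_id*/) const { return false; }
};

// A derived provider overrides only what it needs; per-run behavior can now come from the options.
class MyProvider : public MockExecutionProvider {
 public:
  MockStatus OnRunStart(const MockRunOptions& run_options) override {
    std::cout << "run starting, verbose=" << run_options.log_verbose << "\n";
    return MockStatus::OK();
  }
};

int main() {
  MyProvider ep;
  ep.OnRunStart(MockRunOptions{});
  std::cout << "captured? " << ep.IsGraphCaptured(0) << "\n";
  return 0;
}
```
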
diff --git a/include/onnxruntime/core/framework/int4.h b/include/onnxruntime/core/framework/int4.h
new file mode 100644
index 0000000000000..aff365dc9738f
--- /dev/null
+++ b/include/onnxruntime/core/framework/int4.h
@@ -0,0 +1,159 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include "core/common/common.h"
+#include "core/common/gsl.h"
+
+namespace onnxruntime {
+
+template <bool Signed>
+struct Int4Traits;
+
+template <>
+struct Int4Traits<true> {
+ using UnpackedType = int8_t;
+ static constexpr int8_t min_val = -8;
+ static constexpr int8_t max_val = 7;
+};
+
+template <>
+struct Int4Traits<false> {
+ using UnpackedType = uint8_t;
+ static constexpr uint8_t min_val = 0;
+ static constexpr uint8_t max_val = 15;
+};
+
+/// <summary>
+/// Stores 2 packed 4-bit elements in 1 byte.
+/// </summary>
+/// <typeparam name="Signed">Set to true if signed int4, or false if unsigned uint4.</typeparam>
+template <bool Signed>
+struct Int4x2Base {
+  using UnpackedType = typename Int4Traits<Signed>::UnpackedType;
+  static constexpr UnpackedType min_val = Int4Traits<Signed>::min_val;
+  static constexpr UnpackedType max_val = Int4Traits<Signed>::max_val;
+
+ std::byte bits_{};
+
+ Int4x2Base() = default;
+
+ explicit Int4x2Base(std::byte bits) {
+ bits_ = bits;
+ }
+
+ Int4x2Base(UnpackedType val0, UnpackedType val1) {
+    bits_ = static_cast<std::byte>(((val1 & 0xF) << 4) | (val0 & 0xF));
+ }
+
+ static inline int8_t SignExtendLower4Bits(std::byte bits) {
+ // Sign-extend lower 4-bits by left shifting and then doing an arithmetic right shift.
+ constexpr uint8_t shift = (sizeof(int32_t) * 8) - 4;
+    return static_cast<int8_t>((static_cast<int32_t>(bits) << shift) >> shift);
+ }
+
+ inline UnpackedType GetElem(size_t index) const {
+ assert(index <= 1);
+    const uint8_t shift = 4 * static_cast<uint8_t>(index);
+ const std::byte val = (bits_ >> shift) & std::byte{0xF};
+
+ if constexpr (Signed) {
+ return SignExtendLower4Bits(val);
+ } else {
+      return static_cast<UnpackedType>(val);
+ }
+ }
+
+ inline void SetElem(size_t index, UnpackedType val) {
+ assert(index <= 1);
+    const uint8_t shift = 4 * static_cast<uint8_t>(index);
+ const std::byte mask = std::byte{0xF0} >> shift;
+
+ bits_ &= mask; // Clear 4-bit element to 0
+    bits_ |= static_cast<std::byte>((val & 0xF) << shift);  // Set 4-bit element to val
+ }
+
+ inline std::byte ToBits() const {
+ return bits_;
+ }
+
+ static size_t CalcNumInt4Pairs(size_t num_int4_elems) {
+ return (num_int4_elems + 1) / 2;
+ }
+
+  /// <summary>
+  /// Copy a source buffer of 4-bit elements (packed) into a destination buffer of 8-bit elements (unpacked).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store unpacked 8-bit elements</param>
+  /// <param name="src">Source buffer with 4-bit elements</param>
+  /// <returns>True on success</returns>
+  static bool Unpack(gsl::span<UnpackedType> dst, gsl::span<const Int4x2Base<Signed>> src) {
+ if (CalcNumInt4Pairs(dst.size()) != src.size()) {
+ return false;
+ }
+
+ if (src.empty()) {
+ return true;
+ }
+
+ for (size_t i = 0; i < dst.size(); i++) {
+ size_t r = i >> 1; // i / 2;
+ size_t c = i & 0x1; // i % 2;
+ dst[i] = src[r].GetElem(c);
+ }
+
+ return true;
+ }
+
+  /// <summary>
+  /// Copy a source buffer of 8-bit elements (unpacked) into a destination buffer of 4-bit elements (packed).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store packed 4-bit elements</param>
+  /// <param name="src">Source buffer with 8-bit elements</param>
+  /// <returns>True on success</returns>
+  static bool Pack(gsl::span<Int4x2Base<Signed>> dst, gsl::span<const UnpackedType> src) {
+ if (CalcNumInt4Pairs(src.size()) != dst.size()) {
+ return false;
+ }
+
+ if (src.empty()) {
+ return true;
+ }
+
+ size_t src_i = 0;
+ size_t dst_i = 0;
+
+ for (; src_i < src.size() - 1; src_i += 2) {
+ dst[dst_i++] = Int4x2Base(src[src_i], src[src_i + 1]);
+ }
+
+ if (src_i < src.size()) {
+ dst[dst_i] = Int4x2Base(src[src_i], 0);
+ }
+
+ return true;
+ }
+
+  /// <summary>
+  /// Returns hierarchical indices for a packed int4 element from the given element index.
+  ///
+  /// Usage:
+  /// Int4x2* data = ...;
+  /// auto indices = GetTensorElemIndices(3);  // 4th int4 element
+  /// int8_t elem = data[indices.first].GetElem(indices.second);
+  /// </summary>
+  /// <param name="index">Index of 4-bit element</param>
+  /// <returns>Unpacked element</returns>
+  static inline std::pair<size_t, size_t> GetTensorElemIndices(size_t index) {
+ return {index >> 1, index & 0x1};
+ }
+};
+
+using Int4x2 = Int4x2Base<true>;
+using UInt4x2 = Int4x2Base<false>;
+static_assert(sizeof(Int4x2) == sizeof(std::byte));
+static_assert(sizeof(UInt4x2) == sizeof(std::byte));
+} // namespace onnxruntime
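
A usage sketch for the packed int4 type above; it assumes compilation inside the ONNX Runtime tree, where `core/framework/int4.h` and GSL are available:

```cpp
#include <array>
#include <cstdint>
#include <iostream>

#include "core/framework/int4.h"

int main() {
  using onnxruntime::Int4x2;

  // Pack five signed 4-bit values; CalcNumInt4Pairs(5) == 3, and the odd tail is zero-padded.
  std::array<int8_t, 5> unpacked{-8, 7, -1, 3, 2};
  std::array<Int4x2, 3> packed{};
  Int4x2::Pack(gsl::span<Int4x2>(packed.data(), packed.size()),
               gsl::span<const int8_t>(unpacked.data(), unpacked.size()));

  // Address the 4th logical element (index 3) through pair/nibble indices.
  auto [pair_idx, nibble_idx] = Int4x2::GetTensorElemIndices(3);
  std::cout << static_cast<int>(packed[pair_idx].GetElem(nibble_idx)) << "\n";  // prints 3
  return 0;
}
```
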
diff --git a/include/onnxruntime/core/framework/op_kernel_info.h b/include/onnxruntime/core/framework/op_kernel_info.h
index b31c85e32f80c..a0bbfe50a700b 100644
--- a/include/onnxruntime/core/framework/op_kernel_info.h
+++ b/include/onnxruntime/core/framework/op_kernel_info.h
@@ -28,7 +28,8 @@ class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
               const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
const OrtValueNameIdxMap& mlvalue_name_idx_map,
const DataTransferManager& data_transfer_mgr,
- const AllocatorMap& allocators = {});
+ const AllocatorMap& allocators,
+ const ConfigOptions& config_options);
OpKernelInfo(const OpKernelInfo& other);
@@ -50,6 +51,8 @@ class OpKernelInfo : public OpNodeProtoHelper {
const AllocatorMap& GetAllocators() const { return allocators_; }
+ const ConfigOptions& GetConfigOptions() const { return config_options_; }
+
private:
ORT_DISALLOW_MOVE(OpKernelInfo);
ORT_DISALLOW_ASSIGNMENT(OpKernelInfo);
@@ -64,6 +67,7 @@ class OpKernelInfo : public OpNodeProtoHelper {
const DataTransferManager& data_transfer_mgr_;
ProtoHelperNodeContext proto_helper_context_;
const AllocatorMap& allocators_;
+ const ConfigOptions& config_options_;
};
} // namespace onnxruntime
diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h
index 5444c825d7991..789c3b13f2c3e 100644
--- a/include/onnxruntime/core/framework/run_options.h
+++ b/include/onnxruntime/core/framework/run_options.h
@@ -45,5 +45,5 @@ struct OrtRunOptions {
};
namespace onnxruntime {
-using RunOptions = OrtRunOptions;
+using RunOptions = ::OrtRunOptions;
} // namespace onnxruntime
diff --git a/include/onnxruntime/core/framework/stream_handles.h b/include/onnxruntime/core/framework/stream_handles.h
index c235ee904762e..26d78133b52fc 100644
--- a/include/onnxruntime/core/framework/stream_handles.h
+++ b/include/onnxruntime/core/framework/stream_handles.h
@@ -100,6 +100,8 @@ class Stream {
return nullptr;
}
+ virtual WaitNotificationFn GetWaitNotificationFn() const { return nullptr; }
+
private:
StreamHandle handle_;
const OrtDevice& device_;
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
index a867ab6066485..96725aa103064 100644
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@@ -145,6 +145,17 @@ class Tensor final {
/// Bytes required.
static size_t CalculateTensorStorageSize(MLDataType elt_type, const TensorShape& shape);
+  /// <summary>
+  /// Calculate the required storage for the tensor.
+  /// </summary>
+  /// <param name="elt_type">Data type of the tensor elements.</param>
+  /// <param name="shape">Tensor shape.</param>
+  /// <param name="alignment">Power of 2 alignment to include in calculation.
+  /// Bumps up result to the nearest multiple of alignment. Set to 0 to ignore.</param>
+  /// <param name="storage_size">The resulting storage size.</param>
+  /// <returns>Status indicating success or failure.</returns>
+ static Status CalculateTensorStorageSize(MLDataType elt_type, const TensorShape& shape, size_t alignment,
+ size_t& storage_size);
/**
Returns the data type.
*/
@@ -200,7 +211,7 @@ class Tensor final {
    ORT_ENFORCE(utils::IsPrimitiveDataType<T>(dtype_), "Tensor type mismatch. ",
                "T ", "!=", dtype_);
    T* data = reinterpret_cast<T*>(static_cast<char*>(p_data_) + byte_offset_);
-    return gsl::make_span(data, static_cast<size_t>(shape_.Size()));
+    return gsl::make_span(data, static_cast<size_t>(NumStorageElements()));
}
  template <typename T>
@@ -217,7 +228,7 @@ class Tensor final {
    ORT_ENFORCE(utils::IsPrimitiveDataType<T>(dtype_), "Tensor type mismatch. ",
                "T ", "!=", dtype_);
    const T* data = reinterpret_cast<const T*>(static_cast<const char*>(p_data_) + byte_offset_);
- return gsl::make_span(data, static_cast